Diffstat (limited to 'llvm')
54 files changed, 3858 insertions, 1940 deletions
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 4c70b98..e9a6faa 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -187,28 +187,29 @@ if ("lldb" IN_LIST LLVM_ENABLE_PROJECTS) endif () if ("libc" IN_LIST LLVM_ENABLE_PROJECTS) - message(WARNING "Using LLVM_ENABLE_PROJECTS=libc is deprecated. Please use " + message(WARNING "Using LLVM_ENABLE_PROJECTS=libc is deprecated now, and will " + "become a fatal error in a future release. Please use " "-DLLVM_ENABLE_RUNTIMES=libc or see the instructions at " "https://libc.llvm.org/ for building the runtimes.") endif() if ("compiler-rt" IN_LIST LLVM_ENABLE_PROJECTS) message(WARNING "Using LLVM_ENABLE_PROJECTS=compiler-rt is deprecated now, and will " - "become a fatal error in the LLVM 21 release. Please use " + "become a fatal error in a future release. Please use " "-DLLVM_ENABLE_RUNTIMES=compiler-rt or see the instructions at " "https://compiler-rt.llvm.org/ for building the runtimes.") endif() if ("offload" IN_LIST LLVM_ENABLE_PROJECTS) message(WARNING "Using LLVM_ENABLE_PROJECTS=offload is deprecated now, and will " - "become a fatal error in the LLVM 21 release. Please use " + "become a fatal error in a future release. Please use " "-DLLVM_ENABLE_RUNTIMES=offload or see the instructions at " "https://openmp.llvm.org/ for building the runtimes.") endif() if ("openmp" IN_LIST LLVM_ENABLE_PROJECTS) message(WARNING "Using LLVM_ENABLE_PROJECTS=openmp is deprecated now, and will " - "become a fatal error in the LLVM 21 release. Please use " + "become a fatal error in a future release. Please use " "-DLLVM_ENABLE_RUNTIMES=openmp or see the instructions at " "https://openmp.llvm.org/ for building the runtimes.") endif() @@ -221,7 +222,7 @@ endif () if ("libclc" IN_LIST LLVM_ENABLE_PROJECTS) message(WARNING "Using LLVM_ENABLE_PROJECTS=libclc is deprecated now, and will " - "become a fatal error in the LLVM 21 release. Please use " + "become a fatal error in a future release. Please use " "-DLLVM_ENABLE_RUNTIMES=libclc or see the instructions at " "https://libclc.llvm.org/ for building the runtimes.") endif() diff --git a/llvm/cmake/modules/LLVMProcessSources.cmake b/llvm/cmake/modules/LLVMProcessSources.cmake index cf358a8..0670d60 100644 --- a/llvm/cmake/modules/LLVMProcessSources.cmake +++ b/llvm/cmake/modules/LLVMProcessSources.cmake @@ -58,21 +58,6 @@ function(llvm_process_sources OUT_VAR) set(sources ${ARG_UNPARSED_ARGUMENTS}) llvm_check_source_file_list(${sources}) - # Don't generate __SHORT_FILE__ on VS builds as it can prevent build parallelisation. 
- if(NOT CMAKE_GENERATOR MATCHES "Visual Studio") - foreach(fn ${sources}) - get_filename_component(suf ${fn} EXT) - if("${suf}" STREQUAL ".cpp" OR "${suf}" STREQUAL ".c") - get_filename_component(short_name ${fn} NAME) - set_property( - SOURCE ${fn} - APPEND - PROPERTY COMPILE_DEFINITIONS __SHORT_FILE__="${short_name}") - endif() - endforeach() - endif() - - # This adds .td and .h files to the Visual Studio solution: add_td_sources(sources) find_all_header_files(hdrs "${ARG_ADDITIONAL_HEADER_DIRS}") diff --git a/llvm/docs/MergeFunctions.rst b/llvm/docs/MergeFunctions.rst index 02344bc..c27f603 100644 --- a/llvm/docs/MergeFunctions.rst +++ b/llvm/docs/MergeFunctions.rst @@ -7,7 +7,7 @@ MergeFunctions pass, how it works Introduction ============ -Sometimes code contains equal functions, or functions that does exactly the same +Sometimes code contains equal functions, or functions that do exactly the same thing even though they are non-equal on the IR level (e.g.: multiplication on 2 and 'shl 1'). It could happen due to several reasons: mainly, the usage of templates and automatic code generators. Though, sometimes the user itself could @@ -16,7 +16,7 @@ write the same thing twice :-) The main purpose of this pass is to recognize such functions and merge them. This document is the extension to pass comments and describes the pass logic. It -describes the algorithm that is used in order to compare functions and +describes the algorithm used to compare functions and explains how we could combine equal functions correctly to keep the module valid. @@ -58,7 +58,7 @@ It's especially important to understand chapter 3 of tutorial: :doc:`tutorial/LangImpl03` -The reader should also know how passes work in LLVM. They could use this +The reader should also know how passes work in LLVM. They can use this article as a reference and start point here: :doc:`WritingAnLLVMPass` @@ -68,7 +68,7 @@ debugging and bug-fixing. Narrative structure ------------------- -The article consists of three parts. The first part explains pass functionality +This article consists of three parts. The first part explains pass functionality on the top-level. The second part describes the comparison procedure itself. The third part describes the merging process. @@ -130,7 +130,7 @@ access lookup? The answer is: "yes". Random-access """"""""""""" -How it could this be done? Just convert each function to a number, and gather +How can this be done? Just convert each function to a number, and gather all of them in a special hash-table. Functions with equal hashes are equal. Good hashing means, that every function part must be taken into account. That means we have to convert every function part into some number, and then add it @@ -190,17 +190,17 @@ The algorithm is pretty simple: 1. Put all module's functions into the *worklist*. -2. Scan *worklist*'s functions twice: first enumerate only strong functions and +2. Scan *worklist*'s functions twice: first, enumerate only strong functions and then only weak ones: 2.1. Loop body: take a function from *worklist* (call it *FCur*) and try to insert it into *FnTree*: check whether *FCur* is equal to one of functions in *FnTree*. If there *is* an equal function in *FnTree* - (call it *FExists*): merge function *FCur* with *FExists*. Otherwise add + (call it *FExists*): merge function *FCur* with *FExists*. Otherwise, add the function from the *worklist* to *FnTree*. 3. Once the *worklist* scanning and merging operations are complete, check the -*Deferred* list. 
If it is not empty: refill the *worklist* contents with +*Deferred* list. If it is not empty, refill the *worklist* contents with *Deferred* list and redo step 2, if the *Deferred* list is empty, then exit from method. @@ -249,14 +249,14 @@ Below, we will use the following operations: The rest of the article is based on *MergeFunctions.cpp* source code (found in *<llvm_dir>/lib/Transforms/IPO/MergeFunctions.cpp*). We would like -to ask reader to keep this file open, so we could use it as a reference +to ask the reader to keep this file open, so we could use it as a reference for further explanations. Now, we're ready to proceed to the next chapter and see how it works. Functions comparison ==================== -At first, let's define how exactly we compare complex objects. +First, let's define exactly how we compare complex objects. Complex object comparison (function, basic-block, etc) is mostly based on its sub-object comparison results. It is similar to the next "tree" objects @@ -307,7 +307,7 @@ to those we met later in function body (value we met first would be *less*). This is done by “``FunctionComparator::cmpValues(const Value*, const Value*)``” method (will be described a bit later). -4. Function body comparison. As it written in method comments: +4. Function body comparison. As written in method comments: “We do a CFG-ordered walk since the actual ordering of the blocks in the linked list is immaterial. Our walk starts at the entry block for both functions, then @@ -477,7 +477,7 @@ Of course, we can combine insertion and comparison: = sn_mapR.insert(std::make_pair(Right, sn_mapR.size())); return cmpNumbers(LeftRes.first->second, RightRes.first->second); -Let's look, how whole method could be implemented. +Let's look at how the whole method could be implemented. 1. We have to start with the bad news. Consider function self and cross-referencing cases: @@ -519,7 +519,7 @@ the result of numbers comparison: if (LeftRes.first->second < RightRes.first->second) return -1; return 1; -Now when *cmpValues* returns 0, we can proceed the comparison procedure. +Now, when *cmpValues* returns 0, we can proceed with the comparison procedure. Otherwise, if we get (-1 or 1), we need to pass this result to the top level, and finish comparison procedure. @@ -549,7 +549,7 @@ losslessly bitcasted to each other. The further explanation is modification of 2.1.3.1. If types are vectors, compare their bitwidth using the *cmpNumbers*. If result is not 0, return it. - 2.1.3.2. Different types, but not a vectors: + 2.1.3.2. Different types, but not vectors: * if both of them are pointers, good for us, we can proceed to step 3. * if one of types is pointer, return result of *isPointer* flags @@ -654,7 +654,7 @@ O(N*N) to O(log(N)). Merging process, mergeTwoFunctions ================================== -Once *MergeFunctions* detected that current function (*G*) is equal to one that +Once *MergeFunctions* detects that current function (*G*) is equal to one that were analyzed before (function *F*) it calls ``mergeTwoFunctions(Function*, Function*)``. @@ -664,7 +664,7 @@ Operation affects ``FnTree`` contents with next way: *F* will stay in functions that calls *G* would be put into ``Deferred`` set and removed from ``FnTree``, and analyzed again. -The approach is next: +The approach is as follows: 1. Most wished case: when we can use alias and both of *F* and *G* are weak. We make both of them with aliases to the third strong function *H*. 
Actually *H* @@ -691,12 +691,12 @@ ok: we can use alias to *F* instead of *G* or change call instructions itself. HasGlobalAliases, removeUsers ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -First consider the case when we have global aliases of one function name to +First, consider the case when we have global aliases of one function name to another. Our purpose is make both of them with aliases to the third strong function. Though if we keep *F* alive and without major changes we can leave it in ``FnTree``. Try to combine these two goals. -Do stub replacement of *F* itself with an alias to *F*. +Do a stub replacement of *F* itself with an alias to *F*. 1. Create stub function *H*, with the same name and attributes like function *F*. It takes maximum alignment of *F* and *G*. @@ -725,7 +725,7 @@ also have alias to *F*. No global aliases, replaceDirectCallers ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -If global aliases are not supported. We call ``replaceDirectCallers``. Just +If global aliases are not supported, we call ``replaceDirectCallers``. Just go through all calls of *G* and replace it with calls of *F*. If you look into the method you will see that it scans all uses of *G* too, and if use is callee (if user is call instruction and *G* is used as what to be called), we replace diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index d28eb68..2dc8f9f 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -971,6 +971,10 @@ Syntax: declare void @llvm.nvvm.prefetch.L1(ptr %ptr) declare void @llvm.nvvm.prefetch.L2(ptr %ptr) + declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr) + declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr) + declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr) + declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr) declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr) @@ -983,7 +987,10 @@ The '``@llvm.nvvm.prefetch.*``' and '``@llvm.nvvm.prefetchu.*``' intrinsic correspond to the '``prefetch.*``;' and '``prefetchu.*``' family of PTX instructions. The '``prefetch.*``' instructions bring the cache line containing the specified address in global or local memory address space into the -specified cache level (L1 or L2). The '`prefetchu.*``' instruction brings the cache line +specified cache level (L1 or L2). If the '``.tensormap``' qualifier is specified then the +prefetch instruction brings the cache line containing the specified address in the +'``.const``' or '``.param memory``' state space for subsequent use by the '``cp.async.bulk.tensor``' +instruction. The '`prefetchu.*``' instruction brings the cache line containing the specified generic address into the specified uniform cache level. If no address space is specified, it is assumed to be generic address. The intrinsic uses and eviction priority which can be accessed by the '``.level::eviction_priority``' modifier. diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 0c49fc8..44a1366 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -73,6 +73,7 @@ Changes to Vectorizers * Added initial support for copyable elements in SLP, which models copyable elements as add <element>, 0, i.e. uses identity constants for missing lanes. 
+* SLP vectorizer supports initial recognition of FMA/FMAD pattern Changes to the AArch64 Backend ------------------------------ diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index ea5eac4..1f27213 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -454,28 +454,28 @@ protected: return NextPowerOf2(NumEntries * 4 / 3 + 1); } - void moveFromOldBuckets(BucketT *OldBucketsBegin, BucketT *OldBucketsEnd) { + void moveFromOldBuckets(iterator_range<BucketT *> OldBuckets) { initEmpty(); // Insert all the old elements. const KeyT EmptyKey = getEmptyKey(); const KeyT TombstoneKey = getTombstoneKey(); - for (BucketT *B = OldBucketsBegin, *E = OldBucketsEnd; B != E; ++B) { - if (!KeyInfoT::isEqual(B->getFirst(), EmptyKey) && - !KeyInfoT::isEqual(B->getFirst(), TombstoneKey)) { + for (BucketT &B : OldBuckets) { + if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) && + !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) { // Insert the key/value into the new table. BucketT *DestBucket; - bool FoundVal = LookupBucketFor(B->getFirst(), DestBucket); + bool FoundVal = LookupBucketFor(B.getFirst(), DestBucket); (void)FoundVal; // silence warning. assert(!FoundVal && "Key already in new map?"); - DestBucket->getFirst() = std::move(B->getFirst()); - ::new (&DestBucket->getSecond()) ValueT(std::move(B->getSecond())); + DestBucket->getFirst() = std::move(B.getFirst()); + ::new (&DestBucket->getSecond()) ValueT(std::move(B.getSecond())); incrementNumEntries(); // Free the value. - B->getSecond().~ValueT(); + B.getSecond().~ValueT(); } - B->getFirst().~KeyT(); + B.getFirst().~KeyT(); } } @@ -867,7 +867,8 @@ public: return; } - this->moveFromOldBuckets(OldBuckets, OldBuckets + OldNumBuckets); + this->moveFromOldBuckets( + llvm::make_range(OldBuckets, OldBuckets + OldNumBuckets)); // Free the old table. deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets, @@ -952,6 +953,9 @@ class SmallDenseMap struct LargeRep { BucketT *Buckets; unsigned NumBuckets; + iterator_range<BucketT *> buckets() { + return llvm::make_range(Buckets, Buckets + NumBuckets); + } }; /// A "union" of an inline bucket array and the struct representing @@ -1129,7 +1133,7 @@ public: Small = false; new (getLargeRep()) LargeRep(allocateBuckets(AtLeast)); } - this->moveFromOldBuckets(TmpBegin, TmpEnd); + this->moveFromOldBuckets(llvm::make_range(TmpBegin, TmpEnd)); return; } @@ -1141,8 +1145,7 @@ public: new (getLargeRep()) LargeRep(allocateBuckets(AtLeast)); } - this->moveFromOldBuckets(OldRep.Buckets, - OldRep.Buckets + OldRep.NumBuckets); + this->moveFromOldBuckets(OldRep.buckets()); // Free the old table. deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets, diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h index f98bd68..1679596 100644 --- a/llvm/include/llvm/Analysis/DependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h @@ -47,994 +47,908 @@ #include "llvm/Support/Compiler.h" namespace llvm { - class AAResults; - template <typename T> class ArrayRef; - class Loop; - class LoopInfo; - class SCEVConstant; - class raw_ostream; - - /// Dependence - This class represents a dependence between two memory - /// memory references in a function. 
It contains minimal information and - /// is used in the very common situation where the compiler is unable to - /// determine anything beyond the existence of a dependence; that is, it - /// represents a confused dependence (see also FullDependence). In most - /// cases (for output, flow, and anti dependences), the dependence implies - /// an ordering, where the source must precede the destination; in contrast, - /// input dependences are unordered. - /// - /// When a dependence graph is built, each Dependence will be a member of - /// the set of predecessor edges for its destination instruction and a set - /// if successor edges for its source instruction. These sets are represented - /// as singly-linked lists, with the "next" fields stored in the dependence - /// itelf. - class LLVM_ABI Dependence { - protected: - Dependence(Dependence &&) = default; - Dependence &operator=(Dependence &&) = default; - - public: - Dependence(Instruction *Source, Instruction *Destination, - const SCEVUnionPredicate &A) - : Src(Source), Dst(Destination), Assumptions(A) {} - virtual ~Dependence() = default; - - /// Dependence::DVEntry - Each level in the distance/direction vector - /// has a direction (or perhaps a union of several directions), and - /// perhaps a distance. - struct DVEntry { - enum : unsigned char { - NONE = 0, - LT = 1, - EQ = 2, - LE = 3, - GT = 4, - NE = 5, - GE = 6, - ALL = 7 - }; - unsigned char Direction : 3; // Init to ALL, then refine. - bool Scalar : 1; // Init to true. - bool PeelFirst : 1; // Peeling the first iteration will break dependence. - bool PeelLast : 1; // Peeling the last iteration will break the dependence. - bool Splitable : 1; // Splitting the loop will break dependence. - const SCEV *Distance = nullptr; // NULL implies no distance available. - DVEntry() - : Direction(ALL), Scalar(true), PeelFirst(false), PeelLast(false), - Splitable(false) {} +class AAResults; +template <typename T> class ArrayRef; +class Loop; +class LoopInfo; +class SCEVConstant; +class raw_ostream; + +/// Dependence - This class represents a dependence between two memory +/// memory references in a function. It contains minimal information and +/// is used in the very common situation where the compiler is unable to +/// determine anything beyond the existence of a dependence; that is, it +/// represents a confused dependence (see also FullDependence). In most +/// cases (for output, flow, and anti dependences), the dependence implies +/// an ordering, where the source must precede the destination; in contrast, +/// input dependences are unordered. +/// +/// When a dependence graph is built, each Dependence will be a member of +/// the set of predecessor edges for its destination instruction and a set +/// if successor edges for its source instruction. These sets are represented +/// as singly-linked lists, with the "next" fields stored in the dependence +/// itelf. +class LLVM_ABI Dependence { +protected: + Dependence(Dependence &&) = default; + Dependence &operator=(Dependence &&) = default; + +public: + Dependence(Instruction *Source, Instruction *Destination, + const SCEVUnionPredicate &A) + : Src(Source), Dst(Destination), Assumptions(A) {} + virtual ~Dependence() = default; + + /// Dependence::DVEntry - Each level in the distance/direction vector + /// has a direction (or perhaps a union of several directions), and + /// perhaps a distance. 
+ struct DVEntry { + enum : unsigned char { + NONE = 0, + LT = 1, + EQ = 2, + LE = 3, + GT = 4, + NE = 5, + GE = 6, + ALL = 7 }; + unsigned char Direction : 3; // Init to ALL, then refine. + bool Scalar : 1; // Init to true. + bool PeelFirst : 1; // Peeling the first iteration will break dependence. + bool PeelLast : 1; // Peeling the last iteration will break the dependence. + bool Splitable : 1; // Splitting the loop will break dependence. + const SCEV *Distance = nullptr; // NULL implies no distance available. + DVEntry() + : Direction(ALL), Scalar(true), PeelFirst(false), PeelLast(false), + Splitable(false) {} + }; - /// getSrc - Returns the source instruction for this dependence. - /// - Instruction *getSrc() const { return Src; } - - /// getDst - Returns the destination instruction for this dependence. - /// - Instruction *getDst() const { return Dst; } - - /// isInput - Returns true if this is an input dependence. - /// - bool isInput() const; - - /// isOutput - Returns true if this is an output dependence. - /// - bool isOutput() const; - - /// isFlow - Returns true if this is a flow (aka true) dependence. - /// - bool isFlow() const; - - /// isAnti - Returns true if this is an anti dependence. - /// - bool isAnti() const; - - /// isOrdered - Returns true if dependence is Output, Flow, or Anti - /// - bool isOrdered() const { return isOutput() || isFlow() || isAnti(); } + /// getSrc - Returns the source instruction for this dependence. + Instruction *getSrc() const { return Src; } - /// isUnordered - Returns true if dependence is Input - /// - bool isUnordered() const { return isInput(); } + /// getDst - Returns the destination instruction for this dependence. + Instruction *getDst() const { return Dst; } - /// isLoopIndependent - Returns true if this is a loop-independent - /// dependence. - virtual bool isLoopIndependent() const { return true; } + /// isInput - Returns true if this is an input dependence. + bool isInput() const; - /// isConfused - Returns true if this dependence is confused - /// (the compiler understands nothing and makes worst-case - /// assumptions). - virtual bool isConfused() const { return true; } + /// isOutput - Returns true if this is an output dependence. + bool isOutput() const; - /// isConsistent - Returns true if this dependence is consistent - /// (occurs every time the source and destination are executed). - virtual bool isConsistent() const { return false; } + /// isFlow - Returns true if this is a flow (aka true) dependence. + bool isFlow() const; - /// getLevels - Returns the number of common loops surrounding the - /// source and destination of the dependence. - virtual unsigned getLevels() const { return 0; } + /// isAnti - Returns true if this is an anti dependence. + bool isAnti() const; - /// getDirection - Returns the direction associated with a particular - /// level. - virtual unsigned getDirection(unsigned Level) const { return DVEntry::ALL; } + /// isOrdered - Returns true if dependence is Output, Flow, or Anti + bool isOrdered() const { return isOutput() || isFlow() || isAnti(); } - /// getDistance - Returns the distance (or NULL) associated with a - /// particular level. - virtual const SCEV *getDistance(unsigned Level) const { return nullptr; } + /// isUnordered - Returns true if dependence is Input + bool isUnordered() const { return isInput(); } - /// Check if the direction vector is negative. A negative direction - /// vector means Src and Dst are reversed in the actual program. 
- virtual bool isDirectionNegative() const { return false; } + /// isLoopIndependent - Returns true if this is a loop-independent + /// dependence. + virtual bool isLoopIndependent() const { return true; } - /// If the direction vector is negative, normalize the direction - /// vector to make it non-negative. Normalization is done by reversing - /// Src and Dst, plus reversing the dependence directions and distances - /// in the vector. - virtual bool normalize(ScalarEvolution *SE) { return false; } + /// isConfused - Returns true if this dependence is confused + /// (the compiler understands nothing and makes worst-case assumptions). + virtual bool isConfused() const { return true; } - /// isPeelFirst - Returns true if peeling the first iteration from - /// this loop will break this dependence. - virtual bool isPeelFirst(unsigned Level) const { return false; } + /// isConsistent - Returns true if this dependence is consistent + /// (occurs every time the source and destination are executed). + virtual bool isConsistent() const { return false; } - /// isPeelLast - Returns true if peeling the last iteration from - /// this loop will break this dependence. - virtual bool isPeelLast(unsigned Level) const { return false; } + /// getLevels - Returns the number of common loops surrounding the + /// source and destination of the dependence. + virtual unsigned getLevels() const { return 0; } - /// isSplitable - Returns true if splitting this loop will break - /// the dependence. - virtual bool isSplitable(unsigned Level) const { return false; } + /// getDirection - Returns the direction associated with a particular level. + virtual unsigned getDirection(unsigned Level) const { return DVEntry::ALL; } - /// isScalar - Returns true if a particular level is scalar; that is, - /// if no subscript in the source or destination mention the induction - /// variable associated with the loop at this level. - virtual bool isScalar(unsigned Level) const; + /// getDistance - Returns the distance (or NULL) associated with a particular + /// level. + virtual const SCEV *getDistance(unsigned Level) const { return nullptr; } - /// getNextPredecessor - Returns the value of the NextPredecessor - /// field. - const Dependence *getNextPredecessor() const { return NextPredecessor; } + /// Check if the direction vector is negative. A negative direction + /// vector means Src and Dst are reversed in the actual program. + virtual bool isDirectionNegative() const { return false; } + + /// If the direction vector is negative, normalize the direction + /// vector to make it non-negative. Normalization is done by reversing + /// Src and Dst, plus reversing the dependence directions and distances + /// in the vector. + virtual bool normalize(ScalarEvolution *SE) { return false; } - /// getNextSuccessor - Returns the value of the NextSuccessor - /// field. - const Dependence *getNextSuccessor() const { return NextSuccessor; } + /// isPeelFirst - Returns true if peeling the first iteration from + /// this loop will break this dependence. + virtual bool isPeelFirst(unsigned Level) const { return false; } - /// setNextPredecessor - Sets the value of the NextPredecessor - /// field. - void setNextPredecessor(const Dependence *pred) { NextPredecessor = pred; } + /// isPeelLast - Returns true if peeling the last iteration from + /// this loop will break this dependence. + virtual bool isPeelLast(unsigned Level) const { return false; } - /// setNextSuccessor - Sets the value of the NextSuccessor - /// field. 
- void setNextSuccessor(const Dependence *succ) { NextSuccessor = succ; } + /// isSplitable - Returns true if splitting this loop will break the + /// dependence. + virtual bool isSplitable(unsigned Level) const { return false; } - /// getRuntimeAssumptions - Returns the runtime assumptions under which this - /// Dependence relation is valid. - SCEVUnionPredicate getRuntimeAssumptions() const { return Assumptions; } + /// isScalar - Returns true if a particular level is scalar; that is, + /// if no subscript in the source or destination mention the induction + /// variable associated with the loop at this level. + virtual bool isScalar(unsigned Level) const; + + /// getNextPredecessor - Returns the value of the NextPredecessor field. + const Dependence *getNextPredecessor() const { return NextPredecessor; } + + /// getNextSuccessor - Returns the value of the NextSuccessor field. + const Dependence *getNextSuccessor() const { return NextSuccessor; } + + /// setNextPredecessor - Sets the value of the NextPredecessor + /// field. + void setNextPredecessor(const Dependence *pred) { NextPredecessor = pred; } + + /// setNextSuccessor - Sets the value of the NextSuccessor field. + void setNextSuccessor(const Dependence *succ) { NextSuccessor = succ; } + + /// getRuntimeAssumptions - Returns the runtime assumptions under which this + /// Dependence relation is valid. + SCEVUnionPredicate getRuntimeAssumptions() const { return Assumptions; } + + /// dump - For debugging purposes, dumps a dependence to OS. + void dump(raw_ostream &OS) const; + +protected: + Instruction *Src, *Dst; + +private: + SCEVUnionPredicate Assumptions; + const Dependence *NextPredecessor = nullptr, *NextSuccessor = nullptr; + friend class DependenceInfo; +}; + +/// FullDependence - This class represents a dependence between two memory +/// references in a function. It contains detailed information about the +/// dependence (direction vectors, etc.) and is used when the compiler is +/// able to accurately analyze the interaction of the references; that is, +/// it is not a confused dependence (see Dependence). In most cases +/// (for output, flow, and anti dependences), the dependence implies an +/// ordering, where the source must precede the destination; in contrast, +/// input dependences are unordered. +class LLVM_ABI FullDependence final : public Dependence { +public: + FullDependence(Instruction *Source, Instruction *Destination, + const SCEVUnionPredicate &Assumes, + bool PossiblyLoopIndependent, unsigned Levels); + + /// isLoopIndependent - Returns true if this is a loop-independent + /// dependence. + bool isLoopIndependent() const override { return LoopIndependent; } + + /// isConfused - Returns true if this dependence is confused + /// (the compiler understands nothing and makes worst-case + /// assumptions). + bool isConfused() const override { return false; } + + /// isConsistent - Returns true if this dependence is consistent + /// (occurs every time the source and destination are executed). + bool isConsistent() const override { return Consistent; } + + /// getLevels - Returns the number of common loops surrounding the + /// source and destination of the dependence. + unsigned getLevels() const override { return Levels; } + + /// getDirection - Returns the direction associated with a particular + /// level. + unsigned getDirection(unsigned Level) const override; + + /// getDistance - Returns the distance (or NULL) associated with a + /// particular level. 
+ const SCEV *getDistance(unsigned Level) const override; + + /// Check if the direction vector is negative. A negative direction + /// vector means Src and Dst are reversed in the actual program. + bool isDirectionNegative() const override; + + /// If the direction vector is negative, normalize the direction + /// vector to make it non-negative. Normalization is done by reversing + /// Src and Dst, plus reversing the dependence directions and distances + /// in the vector. + bool normalize(ScalarEvolution *SE) override; + + /// isPeelFirst - Returns true if peeling the first iteration from + /// this loop will break this dependence. + bool isPeelFirst(unsigned Level) const override; + + /// isPeelLast - Returns true if peeling the last iteration from + /// this loop will break this dependence. + bool isPeelLast(unsigned Level) const override; + + /// isSplitable - Returns true if splitting the loop will break + /// the dependence. + bool isSplitable(unsigned Level) const override; + + /// isScalar - Returns true if a particular level is scalar; that is, + /// if no subscript in the source or destination mention the induction + /// variable associated with the loop at this level. + bool isScalar(unsigned Level) const override; + +private: + unsigned short Levels; + bool LoopIndependent; + bool Consistent; // Init to true, then refine. + std::unique_ptr<DVEntry[]> DV; + friend class DependenceInfo; +}; + +/// DependenceInfo - This class is the main dependence-analysis driver. +class DependenceInfo { +public: + DependenceInfo(Function *F, AAResults *AA, ScalarEvolution *SE, LoopInfo *LI) + : AA(AA), SE(SE), LI(LI), F(F) {} + + /// Handle transitive invalidation when the cached analysis results go away. + LLVM_ABI bool invalidate(Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &Inv); + + /// depends - Tests for a dependence between the Src and Dst instructions. + /// Returns NULL if no dependence; otherwise, returns a Dependence (or a + /// FullDependence) with as much information as can be gleaned. By default, + /// the dependence test collects a set of runtime assumptions that cannot be + /// solved at compilation time. By default UnderRuntimeAssumptions is false + /// for a safe approximation of the dependence relation that does not + /// require runtime checks. + LLVM_ABI std::unique_ptr<Dependence> + depends(Instruction *Src, Instruction *Dst, + bool UnderRuntimeAssumptions = false); + + /// getSplitIteration - Give a dependence that's splittable at some + /// particular level, return the iteration that should be used to split + /// the loop. + /// + /// Generally, the dependence analyzer will be used to build + /// a dependence graph for a function (basically a map from instructions + /// to dependences). Looking for cycles in the graph shows us loops + /// that cannot be trivially vectorized/parallelized. + /// + /// We can try to improve the situation by examining all the dependences + /// that make up the cycle, looking for ones we can break. + /// Sometimes, peeling the first or last iteration of a loop will break + /// dependences, and there are flags for those possibilities. + /// Sometimes, splitting a loop at some other iteration will do the trick, + /// and we've got a flag for that case. Rather than waste the space to + /// record the exact iteration (since we rarely know), we provide + /// a method that calculates the iteration. It's a drag that it must work + /// from scratch, but wonderful in that it's possible. 
+ /// + /// Here's an example: + /// + /// for (i = 0; i < 10; i++) + /// A[i] = ... + /// ... = A[11 - i] + /// + /// There's a loop-carried flow dependence from the store to the load, + /// found by the weak-crossing SIV test. The dependence will have a flag, + /// indicating that the dependence can be broken by splitting the loop. + /// Calling getSplitIteration will return 5. + /// Splitting the loop breaks the dependence, like so: + /// + /// for (i = 0; i <= 5; i++) + /// A[i] = ... + /// ... = A[11 - i] + /// for (i = 6; i < 10; i++) + /// A[i] = ... + /// ... = A[11 - i] + /// + /// breaks the dependence and allows us to vectorize/parallelize + /// both loops. + LLVM_ABI const SCEV *getSplitIteration(const Dependence &Dep, unsigned Level); + + Function *getFunction() const { return F; } + + /// getRuntimeAssumptions - Returns all the runtime assumptions under which + /// the dependence test is valid. + LLVM_ABI SCEVUnionPredicate getRuntimeAssumptions() const; + +private: + AAResults *AA; + ScalarEvolution *SE; + LoopInfo *LI; + Function *F; + SmallVector<const SCEVPredicate *, 4> Assumptions; + + /// Subscript - This private struct represents a pair of subscripts from + /// a pair of potentially multi-dimensional array references. We use a + /// vector of them to guide subscript partitioning. + struct Subscript { + const SCEV *Src; + const SCEV *Dst; + enum ClassificationKind { ZIV, SIV, RDIV, MIV, NonLinear } Classification; + SmallBitVector Loops; + SmallBitVector GroupLoops; + SmallBitVector Group; + }; - /// dump - For debugging purposes, dumps a dependence to OS. - /// - void dump(raw_ostream &OS) const; + struct CoefficientInfo { + const SCEV *Coeff; + const SCEV *PosPart; + const SCEV *NegPart; + const SCEV *Iterations; + }; - protected: - Instruction *Src, *Dst; + struct BoundInfo { + const SCEV *Iterations; + const SCEV *Upper[8]; + const SCEV *Lower[8]; + unsigned char Direction; + unsigned char DirSet; + }; + /// Constraint - This private class represents a constraint, as defined + /// in the paper + /// + /// Practical Dependence Testing + /// Goff, Kennedy, Tseng + /// PLDI 1991 + /// + /// There are 5 kinds of constraint, in a hierarchy. + /// 1) Any - indicates no constraint, any dependence is possible. + /// 2) Line - A line ax + by = c, where a, b, and c are parameters, + /// representing the dependence equation. + /// 3) Distance - The value d of the dependence distance; + /// 4) Point - A point <x, y> representing the dependence from + /// iteration x to iteration y. + /// 5) Empty - No dependence is possible. + class Constraint { private: - SCEVUnionPredicate Assumptions; - const Dependence *NextPredecessor = nullptr, *NextSuccessor = nullptr; - friend class DependenceInfo; - }; + enum ConstraintKind { Empty, Point, Distance, Line, Any } Kind; + ScalarEvolution *SE; + const SCEV *A; + const SCEV *B; + const SCEV *C; + const Loop *AssociatedLoop; - /// FullDependence - This class represents a dependence between two memory - /// references in a function. It contains detailed information about the - /// dependence (direction vectors, etc.) and is used when the compiler is - /// able to accurately analyze the interaction of the references; that is, - /// it is not a confused dependence (see Dependence). In most cases - /// (for output, flow, and anti dependences), the dependence implies an - /// ordering, where the source must precede the destination; in contrast, - /// input dependences are unordered. 
- class LLVM_ABI FullDependence final : public Dependence { public: - FullDependence(Instruction *Source, Instruction *Destination, - const SCEVUnionPredicate &Assumes, - bool PossiblyLoopIndependent, unsigned Levels); - - /// isLoopIndependent - Returns true if this is a loop-independent - /// dependence. - bool isLoopIndependent() const override { return LoopIndependent; } - - /// isConfused - Returns true if this dependence is confused - /// (the compiler understands nothing and makes worst-case - /// assumptions). - bool isConfused() const override { return false; } - - /// isConsistent - Returns true if this dependence is consistent - /// (occurs every time the source and destination are executed). - bool isConsistent() const override { return Consistent; } - - /// getLevels - Returns the number of common loops surrounding the - /// source and destination of the dependence. - unsigned getLevels() const override { return Levels; } - - /// getDirection - Returns the direction associated with a particular - /// level. - unsigned getDirection(unsigned Level) const override; - - /// getDistance - Returns the distance (or NULL) associated with a - /// particular level. - const SCEV *getDistance(unsigned Level) const override; - - /// Check if the direction vector is negative. A negative direction - /// vector means Src and Dst are reversed in the actual program. - bool isDirectionNegative() const override; - - /// If the direction vector is negative, normalize the direction - /// vector to make it non-negative. Normalization is done by reversing - /// Src and Dst, plus reversing the dependence directions and distances - /// in the vector. - bool normalize(ScalarEvolution *SE) override; - - /// isPeelFirst - Returns true if peeling the first iteration from - /// this loop will break this dependence. - bool isPeelFirst(unsigned Level) const override; - - /// isPeelLast - Returns true if peeling the last iteration from - /// this loop will break this dependence. - bool isPeelLast(unsigned Level) const override; - - /// isSplitable - Returns true if splitting the loop will break - /// the dependence. - bool isSplitable(unsigned Level) const override; - - /// isScalar - Returns true if a particular level is scalar; that is, - /// if no subscript in the source or destination mention the induction - /// variable associated with the loop at this level. - bool isScalar(unsigned Level) const override; + /// isEmpty - Return true if the constraint is of kind Empty. + bool isEmpty() const { return Kind == Empty; } - private: - unsigned short Levels; - bool LoopIndependent; - bool Consistent; // Init to true, then refine. - std::unique_ptr<DVEntry[]> DV; - friend class DependenceInfo; - }; + /// isPoint - Return true if the constraint is of kind Point. + bool isPoint() const { return Kind == Point; } - /// DependenceInfo - This class is the main dependence-analysis driver. - /// - class DependenceInfo { - public: - DependenceInfo(Function *F, AAResults *AA, ScalarEvolution *SE, - LoopInfo *LI) - : AA(AA), SE(SE), LI(LI), F(F) {} - - /// Handle transitive invalidation when the cached analysis results go away. - LLVM_ABI bool invalidate(Function &F, const PreservedAnalyses &PA, - FunctionAnalysisManager::Invalidator &Inv); - - /// depends - Tests for a dependence between the Src and Dst instructions. - /// Returns NULL if no dependence; otherwise, returns a Dependence (or a - /// FullDependence) with as much information as can be gleaned. 
By default, - /// the dependence test collects a set of runtime assumptions that cannot be - /// solved at compilation time. By default UnderRuntimeAssumptions is false - /// for a safe approximation of the dependence relation that does not - /// require runtime checks. - LLVM_ABI std::unique_ptr<Dependence> - depends(Instruction *Src, Instruction *Dst, - bool UnderRuntimeAssumptions = false); - - /// getSplitIteration - Give a dependence that's splittable at some - /// particular level, return the iteration that should be used to split - /// the loop. - /// - /// Generally, the dependence analyzer will be used to build - /// a dependence graph for a function (basically a map from instructions - /// to dependences). Looking for cycles in the graph shows us loops - /// that cannot be trivially vectorized/parallelized. - /// - /// We can try to improve the situation by examining all the dependences - /// that make up the cycle, looking for ones we can break. - /// Sometimes, peeling the first or last iteration of a loop will break - /// dependences, and there are flags for those possibilities. - /// Sometimes, splitting a loop at some other iteration will do the trick, - /// and we've got a flag for that case. Rather than waste the space to - /// record the exact iteration (since we rarely know), we provide - /// a method that calculates the iteration. It's a drag that it must work - /// from scratch, but wonderful in that it's possible. - /// - /// Here's an example: - /// - /// for (i = 0; i < 10; i++) - /// A[i] = ... - /// ... = A[11 - i] - /// - /// There's a loop-carried flow dependence from the store to the load, - /// found by the weak-crossing SIV test. The dependence will have a flag, - /// indicating that the dependence can be broken by splitting the loop. - /// Calling getSplitIteration will return 5. - /// Splitting the loop breaks the dependence, like so: - /// - /// for (i = 0; i <= 5; i++) - /// A[i] = ... - /// ... = A[11 - i] - /// for (i = 6; i < 10; i++) - /// A[i] = ... - /// ... = A[11 - i] - /// - /// breaks the dependence and allows us to vectorize/parallelize - /// both loops. - LLVM_ABI const SCEV *getSplitIteration(const Dependence &Dep, - unsigned Level); - - Function *getFunction() const { return F; } - - /// getRuntimeAssumptions - Returns all the runtime assumptions under which - /// the dependence test is valid. - LLVM_ABI SCEVUnionPredicate getRuntimeAssumptions() const; + /// isDistance - Return true if the constraint is of kind Distance. + bool isDistance() const { return Kind == Distance; } - private: - AAResults *AA; - ScalarEvolution *SE; - LoopInfo *LI; - Function *F; - SmallVector<const SCEVPredicate *, 4> Assumptions; - - /// Subscript - This private struct represents a pair of subscripts from - /// a pair of potentially multi-dimensional array references. We use a - /// vector of them to guide subscript partitioning. - struct Subscript { - const SCEV *Src; - const SCEV *Dst; - enum ClassificationKind { ZIV, SIV, RDIV, MIV, NonLinear } Classification; - SmallBitVector Loops; - SmallBitVector GroupLoops; - SmallBitVector Group; - }; + /// isLine - Return true if the constraint is of kind Line. + /// Since Distance's can also be represented as Lines, we also return + /// true if the constraint is of kind Distance. 
+ bool isLine() const { return Kind == Line || Kind == Distance; } - struct CoefficientInfo { - const SCEV *Coeff; - const SCEV *PosPart; - const SCEV *NegPart; - const SCEV *Iterations; - }; + /// isAny - Return true if the constraint is of kind Any; + bool isAny() const { return Kind == Any; } - struct BoundInfo { - const SCEV *Iterations; - const SCEV *Upper[8]; - const SCEV *Lower[8]; - unsigned char Direction; - unsigned char DirSet; - }; + /// getX - If constraint is a point <X, Y>, returns X. + /// Otherwise assert. + LLVM_ABI const SCEV *getX() const; - /// Constraint - This private class represents a constraint, as defined - /// in the paper - /// - /// Practical Dependence Testing - /// Goff, Kennedy, Tseng - /// PLDI 1991 - /// - /// There are 5 kinds of constraint, in a hierarchy. - /// 1) Any - indicates no constraint, any dependence is possible. - /// 2) Line - A line ax + by = c, where a, b, and c are parameters, - /// representing the dependence equation. - /// 3) Distance - The value d of the dependence distance; - /// 4) Point - A point <x, y> representing the dependence from - /// iteration x to iteration y. - /// 5) Empty - No dependence is possible. - class Constraint { - private: - enum ConstraintKind { Empty, Point, Distance, Line, Any } Kind; - ScalarEvolution *SE; - const SCEV *A; - const SCEV *B; - const SCEV *C; - const Loop *AssociatedLoop; - - public: - /// isEmpty - Return true if the constraint is of kind Empty. - bool isEmpty() const { return Kind == Empty; } - - /// isPoint - Return true if the constraint is of kind Point. - bool isPoint() const { return Kind == Point; } - - /// isDistance - Return true if the constraint is of kind Distance. - bool isDistance() const { return Kind == Distance; } - - /// isLine - Return true if the constraint is of kind Line. - /// Since Distance's can also be represented as Lines, we also return - /// true if the constraint is of kind Distance. - bool isLine() const { return Kind == Line || Kind == Distance; } - - /// isAny - Return true if the constraint is of kind Any; - bool isAny() const { return Kind == Any; } - - /// getX - If constraint is a point <X, Y>, returns X. - /// Otherwise assert. - LLVM_ABI const SCEV *getX() const; - - /// getY - If constraint is a point <X, Y>, returns Y. - /// Otherwise assert. - LLVM_ABI const SCEV *getY() const; - - /// getA - If constraint is a line AX + BY = C, returns A. - /// Otherwise assert. - LLVM_ABI const SCEV *getA() const; - - /// getB - If constraint is a line AX + BY = C, returns B. - /// Otherwise assert. - LLVM_ABI const SCEV *getB() const; - - /// getC - If constraint is a line AX + BY = C, returns C. - /// Otherwise assert. - LLVM_ABI const SCEV *getC() const; - - /// getD - If constraint is a distance, returns D. - /// Otherwise assert. - LLVM_ABI const SCEV *getD() const; - - /// getAssociatedLoop - Returns the loop associated with this constraint. - LLVM_ABI const Loop *getAssociatedLoop() const; - - /// setPoint - Change a constraint to Point. - LLVM_ABI void setPoint(const SCEV *X, const SCEV *Y, - const Loop *CurrentLoop); - - /// setLine - Change a constraint to Line. - LLVM_ABI void setLine(const SCEV *A, const SCEV *B, const SCEV *C, - const Loop *CurrentLoop); - - /// setDistance - Change a constraint to Distance. - LLVM_ABI void setDistance(const SCEV *D, const Loop *CurrentLoop); - - /// setEmpty - Change a constraint to Empty. - LLVM_ABI void setEmpty(); - - /// setAny - Change a constraint to Any. 
- LLVM_ABI void setAny(ScalarEvolution *SE); - - /// dump - For debugging purposes. Dumps the constraint - /// out to OS. - LLVM_ABI void dump(raw_ostream &OS) const; - }; + /// getY - If constraint is a point <X, Y>, returns Y. + /// Otherwise assert. + LLVM_ABI const SCEV *getY() const; - /// establishNestingLevels - Examines the loop nesting of the Src and Dst - /// instructions and establishes their shared loops. Sets the variables - /// CommonLevels, SrcLevels, and MaxLevels. - /// The source and destination instructions needn't be contained in the same - /// loop. The routine establishNestingLevels finds the level of most deeply - /// nested loop that contains them both, CommonLevels. An instruction that's - /// not contained in a loop is at level = 0. MaxLevels is equal to the level - /// of the source plus the level of the destination, minus CommonLevels. - /// This lets us allocate vectors MaxLevels in length, with room for every - /// distinct loop referenced in both the source and destination subscripts. - /// The variable SrcLevels is the nesting depth of the source instruction. - /// It's used to help calculate distinct loops referenced by the destination. - /// Here's the map from loops to levels: - /// 0 - unused - /// 1 - outermost common loop - /// ... - other common loops - /// CommonLevels - innermost common loop - /// ... - loops containing Src but not Dst - /// SrcLevels - innermost loop containing Src but not Dst - /// ... - loops containing Dst but not Src - /// MaxLevels - innermost loop containing Dst but not Src - /// Consider the follow code fragment: - /// for (a = ...) { - /// for (b = ...) { - /// for (c = ...) { - /// for (d = ...) { - /// A[] = ...; - /// } - /// } - /// for (e = ...) { - /// for (f = ...) { - /// for (g = ...) { - /// ... = A[]; - /// } - /// } - /// } - /// } - /// } - /// If we're looking at the possibility of a dependence between the store - /// to A (the Src) and the load from A (the Dst), we'll note that they - /// have 2 loops in common, so CommonLevels will equal 2 and the direction - /// vector for Result will have 2 entries. SrcLevels = 4 and MaxLevels = 7. - /// A map from loop names to level indices would look like - /// a - 1 - /// b - 2 = CommonLevels - /// c - 3 - /// d - 4 = SrcLevels - /// e - 5 - /// f - 6 - /// g - 7 = MaxLevels - void establishNestingLevels(const Instruction *Src, - const Instruction *Dst); - - unsigned CommonLevels, SrcLevels, MaxLevels; - - /// mapSrcLoop - Given one of the loops containing the source, return - /// its level index in our numbering scheme. - unsigned mapSrcLoop(const Loop *SrcLoop) const; - - /// mapDstLoop - Given one of the loops containing the destination, - /// return its level index in our numbering scheme. - unsigned mapDstLoop(const Loop *DstLoop) const; - - /// isLoopInvariant - Returns true if Expression is loop invariant - /// in LoopNest. - bool isLoopInvariant(const SCEV *Expression, const Loop *LoopNest) const; - - /// Makes sure all subscript pairs share the same integer type by - /// sign-extending as necessary. - /// Sign-extending a subscript is safe because getelementptr assumes the - /// array subscripts are signed. - void unifySubscriptType(ArrayRef<Subscript *> Pairs); - - /// removeMatchingExtensions - Examines a subscript pair. - /// If the source and destination are identically sign (or zero) - /// extended, it strips off the extension in an effort to - /// simplify the actual analysis. 
- void removeMatchingExtensions(Subscript *Pair); - - /// collectCommonLoops - Finds the set of loops from the LoopNest that - /// have a level <= CommonLevels and are referred to by the SCEV Expression. - void collectCommonLoops(const SCEV *Expression, - const Loop *LoopNest, - SmallBitVector &Loops) const; - - /// checkSrcSubscript - Examines the SCEV Src, returning true iff it's - /// linear. Collect the set of loops mentioned by Src. - bool checkSrcSubscript(const SCEV *Src, - const Loop *LoopNest, - SmallBitVector &Loops); - - /// checkDstSubscript - Examines the SCEV Dst, returning true iff it's - /// linear. Collect the set of loops mentioned by Dst. - bool checkDstSubscript(const SCEV *Dst, - const Loop *LoopNest, - SmallBitVector &Loops); - - /// isKnownPredicate - Compare X and Y using the predicate Pred. - /// Basically a wrapper for SCEV::isKnownPredicate, - /// but tries harder, especially in the presence of sign and zero - /// extensions and symbolics. - bool isKnownPredicate(ICmpInst::Predicate Pred, - const SCEV *X, - const SCEV *Y) const; - - /// isKnownLessThan - Compare to see if S is less than Size - /// Another wrapper for isKnownNegative(S - max(Size, 1)) with some extra - /// checking if S is an AddRec and we can prove lessthan using the loop - /// bounds. - bool isKnownLessThan(const SCEV *S, const SCEV *Size) const; - - /// isKnownNonNegative - Compare to see if S is known not to be negative - /// Uses the fact that S comes from Ptr, which may be an inbound GEP, - /// Proving there is no wrapping going on. - bool isKnownNonNegative(const SCEV *S, const Value *Ptr) const; - - /// collectUpperBound - All subscripts are the same type (on my machine, - /// an i64). The loop bound may be a smaller type. collectUpperBound - /// find the bound, if available, and zero extends it to the Type T. - /// (I zero extend since the bound should always be >= 0.) - /// If no upper bound is available, return NULL. - const SCEV *collectUpperBound(const Loop *l, Type *T) const; - - /// collectConstantUpperBound - Calls collectUpperBound(), then - /// attempts to cast it to SCEVConstant. If the cast fails, - /// returns NULL. - const SCEVConstant *collectConstantUpperBound(const Loop *l, Type *T) const; - - /// classifyPair - Examines the subscript pair (the Src and Dst SCEVs) - /// and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear. - /// Collects the associated loops in a set. - Subscript::ClassificationKind classifyPair(const SCEV *Src, - const Loop *SrcLoopNest, - const SCEV *Dst, - const Loop *DstLoopNest, - SmallBitVector &Loops); - - /// testZIV - Tests the ZIV subscript pair (Src and Dst) for dependence. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// If the dependence isn't proven to exist, - /// marks the Result as inconsistent. - bool testZIV(const SCEV *Src, - const SCEV *Dst, - FullDependence &Result) const; - - /// testSIV - Tests the SIV subscript pair (Src and Dst) for dependence. - /// Things of the form [c1 + a1*i] and [c2 + a2*j], where - /// i and j are induction variables, c1 and c2 are loop invariant, - /// and a1 and a2 are constant. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Sets appropriate direction vector entry and, when possible, - /// the distance vector entry. - /// If the dependence isn't proven to exist, - /// marks the Result as inconsistent. 
- bool testSIV(const SCEV *Src, - const SCEV *Dst, - unsigned &Level, - FullDependence &Result, - Constraint &NewConstraint, - const SCEV *&SplitIter) const; - - /// testRDIV - Tests the RDIV subscript pair (Src and Dst) for dependence. - /// Things of the form [c1 + a1*i] and [c2 + a2*j] - /// where i and j are induction variables, c1 and c2 are loop invariant, - /// and a1 and a2 are constant. - /// With minor algebra, this test can also be used for things like - /// [c1 + a1*i + a2*j][c2]. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Marks the Result as inconsistent. - bool testRDIV(const SCEV *Src, - const SCEV *Dst, - FullDependence &Result) const; + /// getA - If constraint is a line AX + BY = C, returns A. + /// Otherwise assert. + LLVM_ABI const SCEV *getA() const; - /// testMIV - Tests the MIV subscript pair (Src and Dst) for dependence. - /// Returns true if dependence disproved. - /// Can sometimes refine direction vectors. - bool testMIV(const SCEV *Src, - const SCEV *Dst, - const SmallBitVector &Loops, - FullDependence &Result) const; - - /// strongSIVtest - Tests the strong SIV subscript pair (Src and Dst) - /// for dependence. - /// Things of the form [c1 + a*i] and [c2 + a*i], - /// where i is an induction variable, c1 and c2 are loop invariant, - /// and a is a constant - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Sets appropriate direction and distance. - bool strongSIVtest(const SCEV *Coeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *CurrentLoop, - unsigned Level, - FullDependence &Result, - Constraint &NewConstraint) const; - - /// weakCrossingSIVtest - Tests the weak-crossing SIV subscript pair - /// (Src and Dst) for dependence. - /// Things of the form [c1 + a*i] and [c2 - a*i], - /// where i is an induction variable, c1 and c2 are loop invariant, - /// and a is a constant. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Sets appropriate direction entry. - /// Set consistent to false. - /// Marks the dependence as splitable. - bool weakCrossingSIVtest(const SCEV *SrcCoeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *CurrentLoop, - unsigned Level, - FullDependence &Result, - Constraint &NewConstraint, - const SCEV *&SplitIter) const; - - /// ExactSIVtest - Tests the SIV subscript pair - /// (Src and Dst) for dependence. - /// Things of the form [c1 + a1*i] and [c2 + a2*i], - /// where i is an induction variable, c1 and c2 are loop invariant, - /// and a1 and a2 are constant. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Sets appropriate direction entry. - /// Set consistent to false. - bool exactSIVtest(const SCEV *SrcCoeff, - const SCEV *DstCoeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *CurrentLoop, - unsigned Level, - FullDependence &Result, - Constraint &NewConstraint) const; - - /// weakZeroSrcSIVtest - Tests the weak-zero SIV subscript pair - /// (Src and Dst) for dependence. - /// Things of the form [c1] and [c2 + a*i], - /// where i is an induction variable, c1 and c2 are loop invariant, - /// and a is a constant. See also weakZeroDstSIVtest. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Sets appropriate direction entry. 
- /// Set consistent to false. - /// If loop peeling will break the dependence, mark appropriately. - bool weakZeroSrcSIVtest(const SCEV *DstCoeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *CurrentLoop, - unsigned Level, - FullDependence &Result, - Constraint &NewConstraint) const; - - /// weakZeroDstSIVtest - Tests the weak-zero SIV subscript pair - /// (Src and Dst) for dependence. - /// Things of the form [c1 + a*i] and [c2], - /// where i is an induction variable, c1 and c2 are loop invariant, - /// and a is a constant. See also weakZeroSrcSIVtest. - /// Returns true if any possible dependence is disproved. - /// If there might be a dependence, returns false. - /// Sets appropriate direction entry. - /// Set consistent to false. - /// If loop peeling will break the dependence, mark appropriately. - bool weakZeroDstSIVtest(const SCEV *SrcCoeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *CurrentLoop, - unsigned Level, - FullDependence &Result, - Constraint &NewConstraint) const; - - /// exactRDIVtest - Tests the RDIV subscript pair for dependence. - /// Things of the form [c1 + a*i] and [c2 + b*j], - /// where i and j are induction variable, c1 and c2 are loop invariant, - /// and a and b are constants. - /// Returns true if any possible dependence is disproved. - /// Marks the result as inconsistent. - /// Works in some cases that symbolicRDIVtest doesn't, - /// and vice versa. - bool exactRDIVtest(const SCEV *SrcCoeff, - const SCEV *DstCoeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *SrcLoop, - const Loop *DstLoop, - FullDependence &Result) const; + /// getB - If constraint is a line AX + BY = C, returns B. + /// Otherwise assert. + LLVM_ABI const SCEV *getB() const; - /// symbolicRDIVtest - Tests the RDIV subscript pair for dependence. - /// Things of the form [c1 + a*i] and [c2 + b*j], - /// where i and j are induction variable, c1 and c2 are loop invariant, - /// and a and b are constants. - /// Returns true if any possible dependence is disproved. - /// Marks the result as inconsistent. - /// Works in some cases that exactRDIVtest doesn't, - /// and vice versa. Can also be used as a backup for - /// ordinary SIV tests. - bool symbolicRDIVtest(const SCEV *SrcCoeff, - const SCEV *DstCoeff, - const SCEV *SrcConst, - const SCEV *DstConst, - const Loop *SrcLoop, - const Loop *DstLoop) const; - - /// gcdMIVtest - Tests an MIV subscript pair for dependence. - /// Returns true if any possible dependence is disproved. - /// Marks the result as inconsistent. - /// Can sometimes disprove the equal direction for 1 or more loops. - // Can handle some symbolics that even the SIV tests don't get, - /// so we use it as a backup for everything. - bool gcdMIVtest(const SCEV *Src, - const SCEV *Dst, - FullDependence &Result) const; - - /// banerjeeMIVtest - Tests an MIV subscript pair for dependence. - /// Returns true if any possible dependence is disproved. - /// Marks the result as inconsistent. - /// Computes directions. - bool banerjeeMIVtest(const SCEV *Src, - const SCEV *Dst, - const SmallBitVector &Loops, - FullDependence &Result) const; - - /// collectCoefficientInfo - Walks through the subscript, - /// collecting each coefficient, the associated loop bounds, - /// and recording its positive and negative parts for later use. - CoefficientInfo *collectCoeffInfo(const SCEV *Subscript, - bool SrcFlag, - const SCEV *&Constant) const; - - /// getPositivePart - X^+ = max(X, 0). 
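Aside (not part of the patch): the GCD (MIV) test mentioned above rests on one divisibility fact. A standalone sketch on plain integers, with hypothetical names; gcdMIVtest itself works on SCEV coefficients and also refines directions.

#include <cstdlib>
#include <numeric>
#include <vector>

// A dependence between c1 + sum(a_k * i_k) and c2 + sum(b_k * j_k) requires
// an integer solution of sum(a_k * i_k) - sum(b_k * j_k) == c2 - c1, so the
// gcd of all coefficients must divide c2 - c1. Returns true iff the
// dependence is disproved.
bool gcdTestDisprovesDependence(const std::vector<long> &SrcCoeffs,
                                const std::vector<long> &DstCoeffs,
                                long C1, long C2) {
  long G = 0;
  for (long A : SrcCoeffs)
    G = std::gcd(G, std::labs(A));
  for (long B : DstCoeffs)
    G = std::gcd(G, std::labs(B));
  if (G == 0)                       // all coefficients zero: ZIV-like case
    return C1 != C2;
  return (C2 - C1) % G != 0;        // gcd does not divide the difference
}
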
- /// - const SCEV *getPositivePart(const SCEV *X) const; - - /// getNegativePart - X^- = min(X, 0). - /// - const SCEV *getNegativePart(const SCEV *X) const; - - /// getLowerBound - Looks through all the bounds info and - /// computes the lower bound given the current direction settings - /// at each level. - const SCEV *getLowerBound(BoundInfo *Bound) const; - - /// getUpperBound - Looks through all the bounds info and - /// computes the upper bound given the current direction settings - /// at each level. - const SCEV *getUpperBound(BoundInfo *Bound) const; - - /// exploreDirections - Hierarchically expands the direction vector - /// search space, combining the directions of discovered dependences - /// in the DirSet field of Bound. Returns the number of distinct - /// dependences discovered. If the dependence is disproved, - /// it will return 0. - unsigned exploreDirections(unsigned Level, - CoefficientInfo *A, - CoefficientInfo *B, - BoundInfo *Bound, - const SmallBitVector &Loops, - unsigned &DepthExpanded, - const SCEV *Delta) const; - - /// testBounds - Returns true iff the current bounds are plausible. - bool testBounds(unsigned char DirKind, - unsigned Level, - BoundInfo *Bound, - const SCEV *Delta) const; - - /// findBoundsALL - Computes the upper and lower bounds for level K - /// using the * direction. Records them in Bound. - void findBoundsALL(CoefficientInfo *A, - CoefficientInfo *B, - BoundInfo *Bound, - unsigned K) const; - - /// findBoundsLT - Computes the upper and lower bounds for level K - /// using the < direction. Records them in Bound. - void findBoundsLT(CoefficientInfo *A, - CoefficientInfo *B, - BoundInfo *Bound, - unsigned K) const; - - /// findBoundsGT - Computes the upper and lower bounds for level K - /// using the > direction. Records them in Bound. - void findBoundsGT(CoefficientInfo *A, - CoefficientInfo *B, - BoundInfo *Bound, - unsigned K) const; - - /// findBoundsEQ - Computes the upper and lower bounds for level K - /// using the = direction. Records them in Bound. - void findBoundsEQ(CoefficientInfo *A, - CoefficientInfo *B, - BoundInfo *Bound, - unsigned K) const; - - /// intersectConstraints - Updates X with the intersection - /// of the Constraints X and Y. Returns true if X has changed. - bool intersectConstraints(Constraint *X, - const Constraint *Y); - - /// propagate - Review the constraints, looking for opportunities - /// to simplify a subscript pair (Src and Dst). - /// Return true if some simplification occurs. - /// If the simplification isn't exact (that is, if it is conservative - /// in terms of dependence), set consistent to false. - bool propagate(const SCEV *&Src, - const SCEV *&Dst, - SmallBitVector &Loops, - SmallVectorImpl<Constraint> &Constraints, - bool &Consistent); - - /// propagateDistance - Attempt to propagate a distance - /// constraint into a subscript pair (Src and Dst). - /// Return true if some simplification occurs. - /// If the simplification isn't exact (that is, if it is conservative - /// in terms of dependence), set consistent to false. - bool propagateDistance(const SCEV *&Src, - const SCEV *&Dst, - Constraint &CurConstraint, - bool &Consistent); - - /// propagatePoint - Attempt to propagate a point - /// constraint into a subscript pair (Src and Dst). - /// Return true if some simplification occurs. - bool propagatePoint(const SCEV *&Src, - const SCEV *&Dst, - Constraint &CurConstraint); - - /// propagateLine - Attempt to propagate a line - /// constraint into a subscript pair (Src and Dst). 
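Aside (not part of the patch): the constraint intersection used during propagation behaves like a small lattice. A simplified sketch that keeps only Empty, Point, and Any (Lines and Distances are omitted, and plain integers stand in for SCEVs); the types and names are hypothetical.

#include <utility>

struct ConstraintSketch {
  enum { Empty, Point, Any } Kind = Any; // Any = no information, Empty = no solution
  std::pair<long, long> XY{};            // valid only when Kind == Point
};

// Updates X with the intersection of X and Y; returns true if X changed,
// mirroring the contract of intersectConstraints.
bool intersectSketch(ConstraintSketch &X, const ConstraintSketch &Y) {
  if (X.Kind == ConstraintSketch::Empty || Y.Kind == ConstraintSketch::Any)
    return false;                          // already bottom, or Y adds nothing
  if (Y.Kind == ConstraintSketch::Empty) { // anything met with Empty is Empty
    X.Kind = ConstraintSketch::Empty;
    return true;
  }
  if (X.Kind == ConstraintSketch::Any) {   // Y is a Point and X had no info
    X = Y;
    return true;
  }
  if (X.XY != Y.XY) {                      // two different Points conflict
    X.Kind = ConstraintSketch::Empty;
    return true;
  }
  return false;                            // identical Points: unchanged
}
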
- /// Return true if some simplification occurs. - /// If the simplification isn't exact (that is, if it is conservative - /// in terms of dependence), set consistent to false. - bool propagateLine(const SCEV *&Src, - const SCEV *&Dst, - Constraint &CurConstraint, - bool &Consistent); - - /// findCoefficient - Given a linear SCEV, - /// return the coefficient corresponding to specified loop. - /// If there isn't one, return the SCEV constant 0. - /// For example, given a*i + b*j + c*k, returning the coefficient - /// corresponding to the j loop would yield b. - const SCEV *findCoefficient(const SCEV *Expr, - const Loop *TargetLoop) const; - - /// zeroCoefficient - Given a linear SCEV, - /// return the SCEV given by zeroing out the coefficient - /// corresponding to the specified loop. - /// For example, given a*i + b*j + c*k, zeroing the coefficient - /// corresponding to the j loop would yield a*i + c*k. - const SCEV *zeroCoefficient(const SCEV *Expr, - const Loop *TargetLoop) const; - - /// addToCoefficient - Given a linear SCEV Expr, - /// return the SCEV given by adding some Value to the - /// coefficient corresponding to the specified TargetLoop. - /// For example, given a*i + b*j + c*k, adding 1 to the coefficient - /// corresponding to the j loop would yield a*i + (b+1)*j + c*k. - const SCEV *addToCoefficient(const SCEV *Expr, - const Loop *TargetLoop, - const SCEV *Value) const; - - /// updateDirection - Update direction vector entry - /// based on the current constraint. - void updateDirection(Dependence::DVEntry &Level, - const Constraint &CurConstraint) const; - - /// Given a linear access function, tries to recover subscripts - /// for each dimension of the array element access. - bool tryDelinearize(Instruction *Src, Instruction *Dst, - SmallVectorImpl<Subscript> &Pair); - - /// Tries to delinearize \p Src and \p Dst access functions for a fixed size - /// multi-dimensional array. Calls tryDelinearizeFixedSizeImpl() to - /// delinearize \p Src and \p Dst separately, - bool tryDelinearizeFixedSize(Instruction *Src, Instruction *Dst, - const SCEV *SrcAccessFn, - const SCEV *DstAccessFn, - SmallVectorImpl<const SCEV *> &SrcSubscripts, - SmallVectorImpl<const SCEV *> &DstSubscripts); - - /// Tries to delinearize access function for a multi-dimensional array with - /// symbolic runtime sizes. - /// Returns true upon success and false otherwise. - bool tryDelinearizeParametricSize( - Instruction *Src, Instruction *Dst, const SCEV *SrcAccessFn, - const SCEV *DstAccessFn, SmallVectorImpl<const SCEV *> &SrcSubscripts, - SmallVectorImpl<const SCEV *> &DstSubscripts); - - /// checkSubscript - Helper function for checkSrcSubscript and - /// checkDstSubscript to avoid duplicate code - bool checkSubscript(const SCEV *Expr, const Loop *LoopNest, - SmallBitVector &Loops, bool IsSrc); - }; // class DependenceInfo - - /// AnalysisPass to compute dependence information in a function - class DependenceAnalysis : public AnalysisInfoMixin<DependenceAnalysis> { - public: - typedef DependenceInfo Result; - LLVM_ABI Result run(Function &F, FunctionAnalysisManager &FAM); + /// getC - If constraint is a line AX + BY = C, returns C. + /// Otherwise assert. + LLVM_ABI const SCEV *getC() const; - private: - LLVM_ABI static AnalysisKey Key; - friend struct AnalysisInfoMixin<DependenceAnalysis>; - }; // class DependenceAnalysis + /// getD - If constraint is a distance, returns D. + /// Otherwise assert. + LLVM_ABI const SCEV *getD() const; - /// Printer pass to dump DA results. 
- struct DependenceAnalysisPrinterPass - : public PassInfoMixin<DependenceAnalysisPrinterPass> { - DependenceAnalysisPrinterPass(raw_ostream &OS, - bool NormalizeResults = false) - : OS(OS), NormalizeResults(NormalizeResults) {} + /// getAssociatedLoop - Returns the loop associated with this constraint. + LLVM_ABI const Loop *getAssociatedLoop() const; - LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + /// setPoint - Change a constraint to Point. + LLVM_ABI void setPoint(const SCEV *X, const SCEV *Y, + const Loop *CurrentLoop); - static bool isRequired() { return true; } + /// setLine - Change a constraint to Line. + LLVM_ABI void setLine(const SCEV *A, const SCEV *B, const SCEV *C, + const Loop *CurrentLoop); - private: - raw_ostream &OS; - bool NormalizeResults; - }; // class DependenceAnalysisPrinterPass + /// setDistance - Change a constraint to Distance. + LLVM_ABI void setDistance(const SCEV *D, const Loop *CurrentLoop); - /// Legacy pass manager pass to access dependence information - class LLVM_ABI DependenceAnalysisWrapperPass : public FunctionPass { - public: - static char ID; // Class identification, replacement for typeinfo - DependenceAnalysisWrapperPass(); + /// setEmpty - Change a constraint to Empty. + LLVM_ABI void setEmpty(); - bool runOnFunction(Function &F) override; - void releaseMemory() override; - void getAnalysisUsage(AnalysisUsage &) const override; - void print(raw_ostream &, const Module * = nullptr) const override; - DependenceInfo &getDI() const; + /// setAny - Change a constraint to Any. + LLVM_ABI void setAny(ScalarEvolution *SE); - private: - std::unique_ptr<DependenceInfo> info; - }; // class DependenceAnalysisWrapperPass + /// dump - For debugging purposes. Dumps the constraint + /// out to OS. + LLVM_ABI void dump(raw_ostream &OS) const; + }; + + /// establishNestingLevels - Examines the loop nesting of the Src and Dst + /// instructions and establishes their shared loops. Sets the variables + /// CommonLevels, SrcLevels, and MaxLevels. + /// The source and destination instructions needn't be contained in the same + /// loop. The routine establishNestingLevels finds the level of most deeply + /// nested loop that contains them both, CommonLevels. An instruction that's + /// not contained in a loop is at level = 0. MaxLevels is equal to the level + /// of the source plus the level of the destination, minus CommonLevels. + /// This lets us allocate vectors MaxLevels in length, with room for every + /// distinct loop referenced in both the source and destination subscripts. + /// The variable SrcLevels is the nesting depth of the source instruction. + /// It's used to help calculate distinct loops referenced by the destination. + /// Here's the map from loops to levels: + /// 0 - unused + /// 1 - outermost common loop + /// ... - other common loops + /// CommonLevels - innermost common loop + /// ... - loops containing Src but not Dst + /// SrcLevels - innermost loop containing Src but not Dst + /// ... - loops containing Dst but not Src + /// MaxLevels - innermost loop containing Dst but not Src + /// Consider the follow code fragment: + /// for (a = ...) { + /// for (b = ...) { + /// for (c = ...) { + /// for (d = ...) { + /// A[] = ...; + /// } + /// } + /// for (e = ...) { + /// for (f = ...) { + /// for (g = ...) { + /// ... 
= A[]; + /// } + /// } + /// } + /// } + /// } + /// If we're looking at the possibility of a dependence between the store + /// to A (the Src) and the load from A (the Dst), we'll note that they + /// have 2 loops in common, so CommonLevels will equal 2 and the direction + /// vector for Result will have 2 entries. SrcLevels = 4 and MaxLevels = 7. + /// A map from loop names to level indices would look like + /// a - 1 + /// b - 2 = CommonLevels + /// c - 3 + /// d - 4 = SrcLevels + /// e - 5 + /// f - 6 + /// g - 7 = MaxLevels + void establishNestingLevels(const Instruction *Src, const Instruction *Dst); + + unsigned CommonLevels, SrcLevels, MaxLevels; + + /// mapSrcLoop - Given one of the loops containing the source, return + /// its level index in our numbering scheme. + unsigned mapSrcLoop(const Loop *SrcLoop) const; + + /// mapDstLoop - Given one of the loops containing the destination, + /// return its level index in our numbering scheme. + unsigned mapDstLoop(const Loop *DstLoop) const; + + /// isLoopInvariant - Returns true if Expression is loop invariant + /// in LoopNest. + bool isLoopInvariant(const SCEV *Expression, const Loop *LoopNest) const; + + /// Makes sure all subscript pairs share the same integer type by + /// sign-extending as necessary. + /// Sign-extending a subscript is safe because getelementptr assumes the + /// array subscripts are signed. + void unifySubscriptType(ArrayRef<Subscript *> Pairs); + + /// removeMatchingExtensions - Examines a subscript pair. + /// If the source and destination are identically sign (or zero) + /// extended, it strips off the extension in an effort to + /// simplify the actual analysis. + void removeMatchingExtensions(Subscript *Pair); + + /// collectCommonLoops - Finds the set of loops from the LoopNest that + /// have a level <= CommonLevels and are referred to by the SCEV Expression. + void collectCommonLoops(const SCEV *Expression, const Loop *LoopNest, + SmallBitVector &Loops) const; + + /// checkSrcSubscript - Examines the SCEV Src, returning true iff it's + /// linear. Collect the set of loops mentioned by Src. + bool checkSrcSubscript(const SCEV *Src, const Loop *LoopNest, + SmallBitVector &Loops); + + /// checkDstSubscript - Examines the SCEV Dst, returning true iff it's + /// linear. Collect the set of loops mentioned by Dst. + bool checkDstSubscript(const SCEV *Dst, const Loop *LoopNest, + SmallBitVector &Loops); + + /// isKnownPredicate - Compare X and Y using the predicate Pred. + /// Basically a wrapper for SCEV::isKnownPredicate, + /// but tries harder, especially in the presence of sign and zero + /// extensions and symbolics. + bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, + const SCEV *Y) const; + + /// isKnownLessThan - Compare to see if S is less than Size + /// Another wrapper for isKnownNegative(S - max(Size, 1)) with some extra + /// checking if S is an AddRec and we can prove lessthan using the loop + /// bounds. + bool isKnownLessThan(const SCEV *S, const SCEV *Size) const; + + /// isKnownNonNegative - Compare to see if S is known not to be negative + /// Uses the fact that S comes from Ptr, which may be an inbound GEP, + /// Proving there is no wrapping going on. + bool isKnownNonNegative(const SCEV *S, const Value *Ptr) const; + + /// collectUpperBound - All subscripts are the same type (on my machine, + /// an i64). The loop bound may be a smaller type. collectUpperBound + /// find the bound, if available, and zero extends it to the Type T. 
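Aside (not part of the patch): the level bookkeeping described in the establishNestingLevels comment can be reproduced on plain loop depths. A worked sketch under the assumption that the depth of the innermost common loop is already known; establishNestingLevels walks the actual loop trees instead.

#include <algorithm>
#include <cassert>

struct LevelsSketch {
  unsigned CommonLevels, SrcLevels, MaxLevels;
};

// SrcDepth/DstDepth are the nesting depths of the two instructions and
// CommonDepth is the depth of their innermost common loop.
LevelsSketch computeLevelsSketch(unsigned SrcDepth, unsigned DstDepth,
                                 unsigned CommonDepth) {
  assert(CommonDepth <= std::min(SrcDepth, DstDepth));
  return {CommonDepth, SrcDepth, SrcDepth + DstDepth - CommonDepth};
}

With the a..g example above: SrcDepth = 4, DstDepth = 5, CommonDepth = 2, giving CommonLevels = 2, SrcLevels = 4, MaxLevels = 7; a destination-only loop of depth D then maps to level D - CommonLevels + SrcLevels (e.g. 'e' at depth 3 maps to level 5).
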
+ /// (I zero extend since the bound should always be >= 0.) + /// If no upper bound is available, return NULL. + const SCEV *collectUpperBound(const Loop *l, Type *T) const; + + /// collectConstantUpperBound - Calls collectUpperBound(), then + /// attempts to cast it to SCEVConstant. If the cast fails, + /// returns NULL. + const SCEVConstant *collectConstantUpperBound(const Loop *l, Type *T) const; + + /// classifyPair - Examines the subscript pair (the Src and Dst SCEVs) + /// and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear. + /// Collects the associated loops in a set. + Subscript::ClassificationKind + classifyPair(const SCEV *Src, const Loop *SrcLoopNest, const SCEV *Dst, + const Loop *DstLoopNest, SmallBitVector &Loops); + + /// testZIV - Tests the ZIV subscript pair (Src and Dst) for dependence. + /// Returns true if any possible dependence is disproved. + /// If there might be a dependence, returns false. + /// If the dependence isn't proven to exist, + /// marks the Result as inconsistent. + bool testZIV(const SCEV *Src, const SCEV *Dst, FullDependence &Result) const; + + /// testSIV - Tests the SIV subscript pair (Src and Dst) for dependence. + /// Things of the form [c1 + a1*i] and [c2 + a2*j], where + /// i and j are induction variables, c1 and c2 are loop invariant, + /// and a1 and a2 are constant. + /// Returns true if any possible dependence is disproved. + /// If there might be a dependence, returns false. + /// Sets appropriate direction vector entry and, when possible, + /// the distance vector entry. + /// If the dependence isn't proven to exist, + /// marks the Result as inconsistent. + bool testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, + FullDependence &Result, Constraint &NewConstraint, + const SCEV *&SplitIter) const; + + /// testRDIV - Tests the RDIV subscript pair (Src and Dst) for dependence. + /// Things of the form [c1 + a1*i] and [c2 + a2*j] + /// where i and j are induction variables, c1 and c2 are loop invariant, + /// and a1 and a2 are constant. + /// With minor algebra, this test can also be used for things like + /// [c1 + a1*i + a2*j][c2]. + /// Returns true if any possible dependence is disproved. + /// If there might be a dependence, returns false. + /// Marks the Result as inconsistent. + bool testRDIV(const SCEV *Src, const SCEV *Dst, FullDependence &Result) const; + + /// testMIV - Tests the MIV subscript pair (Src and Dst) for dependence. + /// Returns true if dependence disproved. + /// Can sometimes refine direction vectors. + bool testMIV(const SCEV *Src, const SCEV *Dst, const SmallBitVector &Loops, + FullDependence &Result) const; + + /// strongSIVtest - Tests the strong SIV subscript pair (Src and Dst) + /// for dependence. + /// Things of the form [c1 + a*i] and [c2 + a*i], + /// where i is an induction variable, c1 and c2 are loop invariant, + /// and a is a constant + /// Returns true if any possible dependence is disproved. + /// If there might be a dependence, returns false. + /// Sets appropriate direction and distance. + bool strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, + const SCEV *DstConst, const Loop *CurrentLoop, + unsigned Level, FullDependence &Result, + Constraint &NewConstraint) const; + + /// weakCrossingSIVtest - Tests the weak-crossing SIV subscript pair + /// (Src and Dst) for dependence. + /// Things of the form [c1 + a*i] and [c2 - a*i], + /// where i is an induction variable, c1 and c2 are loop invariant, + /// and a is a constant. 
+ /// Returns true if any possible dependence is disproved. + /// If there might be a dependence, returns false. + /// Sets appropriate direction entry. + /// Set consistent to false. + /// Marks the dependence as splitable. + bool weakCrossingSIVtest(const SCEV *SrcCoeff, const SCEV *SrcConst, + const SCEV *DstConst, const Loop *CurrentLoop, + unsigned Level, FullDependence &Result, + Constraint &NewConstraint, + const SCEV *&SplitIter) const; + + /// ExactSIVtest - Tests the SIV subscript pair + /// (Src and Dst) for dependence. + /// Things of the form [c1 + a1*i] and [c2 + a2*i], + /// where i is an induction variable, c1 and c2 are loop invariant, + /// and a1 and a2 are constant. + /// Returns true if any possible dependence is disproved. + /// If there might be a dependence, returns false. + /// Sets appropriate direction entry. + /// Set consistent to false. + bool exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, + const SCEV *SrcConst, const SCEV *DstConst, + const Loop *CurrentLoop, unsigned Level, + FullDependence &Result, Constraint &NewConstraint) const; + + /// weakZeroSrcSIVtest - Tests the weak-zero SIV subscript pair + /// (Src and Dst) for dependence. + /// Things of the form [c1] and [c2 + a*i], + /// where i is an induction variable, c1 and c2 are loop invariant, + /// and a is a constant. See also weakZeroDstSIVtest. + /// Returns true if any possible dependence is disproved. + /// If there might be a dependence, returns false. + /// Sets appropriate direction entry. + /// Set consistent to false. + /// If loop peeling will break the dependence, mark appropriately. + bool weakZeroSrcSIVtest(const SCEV *DstCoeff, const SCEV *SrcConst, + const SCEV *DstConst, const Loop *CurrentLoop, + unsigned Level, FullDependence &Result, + Constraint &NewConstraint) const; + + /// weakZeroDstSIVtest - Tests the weak-zero SIV subscript pair + /// (Src and Dst) for dependence. + /// Things of the form [c1 + a*i] and [c2], + /// where i is an induction variable, c1 and c2 are loop invariant, + /// and a is a constant. See also weakZeroSrcSIVtest. + /// Returns true if any possible dependence is disproved. + /// If there might be a dependence, returns false. + /// Sets appropriate direction entry. + /// Set consistent to false. + /// If loop peeling will break the dependence, mark appropriately. + bool weakZeroDstSIVtest(const SCEV *SrcCoeff, const SCEV *SrcConst, + const SCEV *DstConst, const Loop *CurrentLoop, + unsigned Level, FullDependence &Result, + Constraint &NewConstraint) const; + + /// exactRDIVtest - Tests the RDIV subscript pair for dependence. + /// Things of the form [c1 + a*i] and [c2 + b*j], + /// where i and j are induction variable, c1 and c2 are loop invariant, + /// and a and b are constants. + /// Returns true if any possible dependence is disproved. + /// Marks the result as inconsistent. + /// Works in some cases that symbolicRDIVtest doesn't, + /// and vice versa. + bool exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, + const SCEV *SrcConst, const SCEV *DstConst, + const Loop *SrcLoop, const Loop *DstLoop, + FullDependence &Result) const; + + /// symbolicRDIVtest - Tests the RDIV subscript pair for dependence. + /// Things of the form [c1 + a*i] and [c2 + b*j], + /// where i and j are induction variable, c1 and c2 are loop invariant, + /// and a and b are constants. + /// Returns true if any possible dependence is disproved. + /// Marks the result as inconsistent. + /// Works in some cases that exactRDIVtest doesn't, + /// and vice versa. 
Can also be used as a backup for + /// ordinary SIV tests. + bool symbolicRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, + const SCEV *SrcConst, const SCEV *DstConst, + const Loop *SrcLoop, const Loop *DstLoop) const; + + /// gcdMIVtest - Tests an MIV subscript pair for dependence. + /// Returns true if any possible dependence is disproved. + /// Marks the result as inconsistent. + /// Can sometimes disprove the equal direction for 1 or more loops. + // Can handle some symbolics that even the SIV tests don't get, + /// so we use it as a backup for everything. + bool gcdMIVtest(const SCEV *Src, const SCEV *Dst, + FullDependence &Result) const; + + /// banerjeeMIVtest - Tests an MIV subscript pair for dependence. + /// Returns true if any possible dependence is disproved. + /// Marks the result as inconsistent. + /// Computes directions. + bool banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, + const SmallBitVector &Loops, + FullDependence &Result) const; - /// createDependenceAnalysisPass - This creates an instance of the - /// DependenceAnalysis wrapper pass. - LLVM_ABI FunctionPass *createDependenceAnalysisWrapperPass(); + /// collectCoeffInfo - Walks through the subscript, collecting each + /// coefficient, the associated loop bounds, and recording its positive and + /// negative parts for later use. + CoefficientInfo *collectCoeffInfo(const SCEV *Subscript, bool SrcFlag, + const SCEV *&Constant) const; + + /// getPositivePart - X^+ = max(X, 0). + const SCEV *getPositivePart(const SCEV *X) const; + + /// getNegativePart - X^- = min(X, 0). + const SCEV *getNegativePart(const SCEV *X) const; + + /// getLowerBound - Looks through all the bounds info and + /// computes the lower bound given the current direction settings + /// at each level. + const SCEV *getLowerBound(BoundInfo *Bound) const; + + /// getUpperBound - Looks through all the bounds info and + /// computes the upper bound given the current direction settings + /// at each level. + const SCEV *getUpperBound(BoundInfo *Bound) const; + + /// exploreDirections - Hierarchically expands the direction vector + /// search space, combining the directions of discovered dependences + /// in the DirSet field of Bound. Returns the number of distinct + /// dependences discovered. If the dependence is disproved, + /// it will return 0. + unsigned exploreDirections(unsigned Level, CoefficientInfo *A, + CoefficientInfo *B, BoundInfo *Bound, + const SmallBitVector &Loops, + unsigned &DepthExpanded, const SCEV *Delta) const; + + /// testBounds - Returns true iff the current bounds are plausible. + bool testBounds(unsigned char DirKind, unsigned Level, BoundInfo *Bound, + const SCEV *Delta) const; + + /// findBoundsALL - Computes the upper and lower bounds for level K + /// using the * direction. Records them in Bound. + void findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, + unsigned K) const; + + /// findBoundsLT - Computes the upper and lower bounds for level K + /// using the < direction. Records them in Bound. + void findBoundsLT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, + unsigned K) const; + + /// findBoundsGT - Computes the upper and lower bounds for level K + /// using the > direction. Records them in Bound. + void findBoundsGT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, + unsigned K) const; + + /// findBoundsEQ - Computes the upper and lower bounds for level K + /// using the = direction. Records them in Bound. 
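Aside (not part of the patch): the bounds recorded for the '*' direction can be written out in closed form. A plain-integer sketch assuming a normalized loop running from 0 to a known upper bound U; the pass works on SCEVs, tracks positive and negative parts via CoefficientInfo, and falls back to "unbounded" when U is unknown.

#include <algorithm>

long positivePart(long X) { return std::max(X, 0L); }  // X^+ = max(X, 0)
long negativePart(long X) { return std::min(X, 0L); }  // X^- = min(X, 0)

struct BoundsSketch {
  long Lower, Upper;
};

// For the k-th term A*i - B*i' with 0 <= i, i' <= U:
//   lower bound = (A^- - B^+) * U,  upper bound = (A^+ - B^-) * U.
BoundsSketch findBoundsAllSketch(long A, long B, long U) {
  return {(negativePart(A) - positivePart(B)) * U,
          (positivePart(A) - negativePart(B)) * U};
}
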
+ void findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, + unsigned K) const; + + /// intersectConstraints - Updates X with the intersection + /// of the Constraints X and Y. Returns true if X has changed. + bool intersectConstraints(Constraint *X, const Constraint *Y); + + /// propagate - Review the constraints, looking for opportunities + /// to simplify a subscript pair (Src and Dst). + /// Return true if some simplification occurs. + /// If the simplification isn't exact (that is, if it is conservative + /// in terms of dependence), set consistent to false. + bool propagate(const SCEV *&Src, const SCEV *&Dst, SmallBitVector &Loops, + SmallVectorImpl<Constraint> &Constraints, bool &Consistent); + + /// propagateDistance - Attempt to propagate a distance + /// constraint into a subscript pair (Src and Dst). + /// Return true if some simplification occurs. + /// If the simplification isn't exact (that is, if it is conservative + /// in terms of dependence), set consistent to false. + bool propagateDistance(const SCEV *&Src, const SCEV *&Dst, + Constraint &CurConstraint, bool &Consistent); + + /// propagatePoint - Attempt to propagate a point + /// constraint into a subscript pair (Src and Dst). + /// Return true if some simplification occurs. + bool propagatePoint(const SCEV *&Src, const SCEV *&Dst, + Constraint &CurConstraint); + + /// propagateLine - Attempt to propagate a line + /// constraint into a subscript pair (Src and Dst). + /// Return true if some simplification occurs. + /// If the simplification isn't exact (that is, if it is conservative + /// in terms of dependence), set consistent to false. + bool propagateLine(const SCEV *&Src, const SCEV *&Dst, + Constraint &CurConstraint, bool &Consistent); + + /// findCoefficient - Given a linear SCEV, + /// return the coefficient corresponding to specified loop. + /// If there isn't one, return the SCEV constant 0. + /// For example, given a*i + b*j + c*k, returning the coefficient + /// corresponding to the j loop would yield b. + const SCEV *findCoefficient(const SCEV *Expr, const Loop *TargetLoop) const; + + /// zeroCoefficient - Given a linear SCEV, + /// return the SCEV given by zeroing out the coefficient + /// corresponding to the specified loop. + /// For example, given a*i + b*j + c*k, zeroing the coefficient + /// corresponding to the j loop would yield a*i + c*k. + const SCEV *zeroCoefficient(const SCEV *Expr, const Loop *TargetLoop) const; + + /// addToCoefficient - Given a linear SCEV Expr, + /// return the SCEV given by adding some Value to the + /// coefficient corresponding to the specified TargetLoop. + /// For example, given a*i + b*j + c*k, adding 1 to the coefficient + /// corresponding to the j loop would yield a*i + (b+1)*j + c*k. + const SCEV *addToCoefficient(const SCEV *Expr, const Loop *TargetLoop, + const SCEV *Value) const; + + /// updateDirection - Update direction vector entry + /// based on the current constraint. + void updateDirection(Dependence::DVEntry &Level, + const Constraint &CurConstraint) const; + + /// Given a linear access function, tries to recover subscripts + /// for each dimension of the array element access. + bool tryDelinearize(Instruction *Src, Instruction *Dst, + SmallVectorImpl<Subscript> &Pair); + + /// Tries to delinearize \p Src and \p Dst access functions for a fixed size + /// multi-dimensional array. 
Calls tryDelinearizeFixedSizeImpl() to + /// delinearize \p Src and \p Dst separately, + bool tryDelinearizeFixedSize(Instruction *Src, Instruction *Dst, + const SCEV *SrcAccessFn, const SCEV *DstAccessFn, + SmallVectorImpl<const SCEV *> &SrcSubscripts, + SmallVectorImpl<const SCEV *> &DstSubscripts); + + /// Tries to delinearize access function for a multi-dimensional array with + /// symbolic runtime sizes. + /// Returns true upon success and false otherwise. + bool + tryDelinearizeParametricSize(Instruction *Src, Instruction *Dst, + const SCEV *SrcAccessFn, const SCEV *DstAccessFn, + SmallVectorImpl<const SCEV *> &SrcSubscripts, + SmallVectorImpl<const SCEV *> &DstSubscripts); + + /// checkSubscript - Helper function for checkSrcSubscript and + /// checkDstSubscript to avoid duplicate code + bool checkSubscript(const SCEV *Expr, const Loop *LoopNest, + SmallBitVector &Loops, bool IsSrc); +}; // class DependenceInfo + +/// AnalysisPass to compute dependence information in a function +class DependenceAnalysis : public AnalysisInfoMixin<DependenceAnalysis> { +public: + typedef DependenceInfo Result; + LLVM_ABI Result run(Function &F, FunctionAnalysisManager &FAM); + +private: + LLVM_ABI static AnalysisKey Key; + friend struct AnalysisInfoMixin<DependenceAnalysis>; +}; // class DependenceAnalysis + +/// Printer pass to dump DA results. +struct DependenceAnalysisPrinterPass + : public PassInfoMixin<DependenceAnalysisPrinterPass> { + DependenceAnalysisPrinterPass(raw_ostream &OS, bool NormalizeResults = false) + : OS(OS), NormalizeResults(NormalizeResults) {} + + LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + + static bool isRequired() { return true; } + +private: + raw_ostream &OS; + bool NormalizeResults; +}; // class DependenceAnalysisPrinterPass + +/// Legacy pass manager pass to access dependence information +class LLVM_ABI DependenceAnalysisWrapperPass : public FunctionPass { +public: + static char ID; // Class identification, replacement for typeinfo + DependenceAnalysisWrapperPass(); + + bool runOnFunction(Function &F) override; + void releaseMemory() override; + void getAnalysisUsage(AnalysisUsage &) const override; + void print(raw_ostream &, const Module * = nullptr) const override; + DependenceInfo &getDI() const; + +private: + std::unique_ptr<DependenceInfo> info; +}; // class DependenceAnalysisWrapperPass + +/// createDependenceAnalysisPass - This creates an instance of the +/// DependenceAnalysis wrapper pass. 
+LLVM_ABI FunctionPass *createDependenceAnalysisWrapperPass(); } // namespace llvm diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 967d166..1bcc442 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -137,6 +137,7 @@ def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr +def llvm_constant_ptr_ty: LLVMQualPointerType<4>; // (const)ptr def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr @@ -2212,15 +2213,17 @@ def int_nvvm_cp_async_bulk_tensor_prefetch_tile_gather4_2d // Intrinsics for Prefetch and Prefetchu let IntrProperties = [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>] in { foreach level = ["L1", "L2"] in { - def int_nvvm_prefetch_ # level : Intrinsic<[], [llvm_ptr_ty]>; - def int_nvvm_prefetch_global_ # level : Intrinsic<[], [llvm_global_ptr_ty]>; - def int_nvvm_prefetch_local_ # level : Intrinsic<[], [llvm_local_ptr_ty]>; + def int_nvvm_prefetch_ # level : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>; + def int_nvvm_prefetch_global_ # level : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>; + def int_nvvm_prefetch_local_ # level : DefaultAttrsIntrinsic<[], [llvm_local_ptr_ty]>; } + def int_nvvm_prefetch_tensormap : DefaultAttrsIntrinsic<[], [llvm_anyptr_ty]>; + foreach eviction_priority = ["evict_normal", "evict_last"] in - def int_nvvm_prefetch_global_L2_ # eviction_priority : Intrinsic<[], [llvm_global_ptr_ty]>; + def int_nvvm_prefetch_global_L2_ # eviction_priority : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>; - def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty]>; + def int_nvvm_prefetchu_L1 : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>; } // applypriority diff --git a/llvm/include/llvm/Support/DebugLog.h b/llvm/include/llvm/Support/DebugLog.h index a331295..a94e578 100644 --- a/llvm/include/llvm/Support/DebugLog.h +++ b/llvm/include/llvm/Support/DebugLog.h @@ -56,6 +56,16 @@ namespace llvm { DEBUGLOG_WITH_STREAM_AND_TYPE(llvm::dbgs(), LEVEL, DEBUG_TYPE) #define LDBG_LOG_LEVEL_1() LDBG_LOG_LEVEL(1) +// We want the filename without the full path. We are using the __FILE__ macro +// and a constexpr function to strip the path prefix. We can avoid the frontend +// repeated evaluation of __FILE__ by using the __FILE_NAME__ when defined +// (gcc and clang do) which contains the file name already. +#if defined(__FILE_NAME__) +#define __LLVM_FILE_NAME__ __FILE_NAME__ +#else +#define __LLVM_FILE_NAME__ ::llvm::impl::getShortFileName(__FILE__) +#endif + #define DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(STREAM, LEVEL, TYPE, FILE, \ LINE) \ for (bool _c = \ @@ -69,17 +79,8 @@ namespace llvm { #define DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, FILE) \ DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(STREAM, LEVEL, TYPE, FILE, __LINE__) -// When __SHORT_FILE__ is not defined, the File is the full path, -// otherwise __SHORT_FILE__ is defined in CMake to provide the file name -// without the path prefix. 
-#if defined(__SHORT_FILE__) #define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, LEVEL, TYPE) \ - DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, __SHORT_FILE__) -#else -#define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, LEVEL, TYPE) \ - DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, \ - ::llvm::impl::getShortFileName(__FILE__)) -#endif + DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, __LLVM_FILE_NAME__) namespace impl { diff --git a/llvm/include/llvm/Support/GraphWriter.h b/llvm/include/llvm/Support/GraphWriter.h index 39a4c0b..af2e501 100644 --- a/llvm/include/llvm/Support/GraphWriter.h +++ b/llvm/include/llvm/Support/GraphWriter.h @@ -61,8 +61,7 @@ enum Name { LLVM_ABI bool DisplayGraph(StringRef Filename, bool wait = true, GraphProgram::Name program = GraphProgram::DOT); -template<typename GraphType> -class GraphWriter { +template <typename GraphType, typename Derived> class GraphWriterBase { raw_ostream &O; const GraphType &G; bool RenderUsingHTML = false; @@ -75,9 +74,15 @@ class GraphWriter { DOTTraits DTraits; static_assert(std::is_pointer_v<NodeRef>, - "FIXME: Currently GraphWriter requires the NodeRef type to be " - "a pointer.\nThe pointer usage should be moved to " - "DOTGraphTraits, and removed from GraphWriter itself."); + "FIXME: Currently GraphWriterBase requires the NodeRef type to " + "be a pointer.\nThe pointer usage should be moved to " + "DOTGraphTraits, and removed from GraphWriterBase itself."); + + // Cast the 'this' pointer to the derived type and return a reference. + Derived &getDerived() { return *static_cast<Derived *>(this); } + const Derived &getDerived() const { + return *static_cast<const Derived *>(this); + } // Writes the edge labels of the node to O and returns true if there are any // edge labels not equal to the empty string "". @@ -118,23 +123,24 @@ class GraphWriter { } public: - GraphWriter(raw_ostream &o, const GraphType &g, bool SN) : O(o), G(g) { + GraphWriterBase(raw_ostream &o, const GraphType &g, bool SN) : O(o), G(g) { DTraits = DOTTraits(SN); RenderUsingHTML = DTraits.renderNodesUsingHTML(); } + virtual ~GraphWriterBase() {} void writeGraph(const std::string &Title = "") { // Output the header for the graph... - writeHeader(Title); + getDerived().writeHeader(Title); // Emit all of the nodes in the graph... - writeNodes(); + getDerived().writeNodes(); // Output any customizations on the graph - DOTGraphTraits<GraphType>::addCustomGraphFeatures(G, *this); + DOTGraphTraits<GraphType>::addCustomGraphFeatures(G, getDerived()); // Output the end of the graph - writeFooter(); + getDerived().writeFooter(); } void writeHeader(const std::string &Title) { @@ -166,8 +172,8 @@ public: void writeNodes() { // Loop over the graph, printing it out... 
for (const auto Node : nodes<GraphType>(G)) - if (!isNodeHidden(Node)) - writeNode(Node); + if (!getDerived().isNodeHidden(Node)) + getDerived().writeNode(Node); } bool isNodeHidden(NodeRef Node) { return DTraits.isNodeHidden(Node, G); } @@ -302,9 +308,9 @@ public: if (DTraits.getEdgeSourceLabel(Node, EI).empty()) edgeidx = -1; - emitEdge(static_cast<const void*>(Node), edgeidx, - static_cast<const void*>(TargetNode), DestPort, - DTraits.getEdgeAttributes(Node, EI, G)); + getDerived().emitEdge(static_cast<const void *>(Node), edgeidx, + static_cast<const void *>(TargetNode), DestPort, + DTraits.getEdgeAttributes(Node, EI, G)); } } @@ -357,10 +363,17 @@ public: } }; -template<typename GraphType> +template <typename GraphType> +class GraphWriter : public GraphWriterBase<GraphType, GraphWriter<GraphType>> { +public: + GraphWriter(raw_ostream &o, const GraphType &g, bool SN) + : GraphWriterBase<GraphType, GraphWriter<GraphType>>(o, g, SN) {} + ~GraphWriter() override {} +}; + +template <typename GraphType> raw_ostream &WriteGraph(raw_ostream &O, const GraphType &G, - bool ShortNames = false, - const Twine &Title = "") { + bool ShortNames = false, const Twine &Title = "") { // Start the graph emission process... GraphWriter<GraphType> W(O, G, ShortNames); diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index f1473b2..256befa 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -180,8 +180,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F); SrcI != SrcE; ++SrcI) { if (SrcI->mayReadOrWriteMemory()) { - for (inst_iterator DstI = SrcI, DstE = inst_end(F); - DstI != DstE; ++DstI) { + for (inst_iterator DstI = SrcI, DstE = inst_end(F); DstI != DstE; + ++DstI) { if (DstI->mayReadOrWriteMemory()) { OS << "Src:" << *SrcI << " --> Dst:" << *DstI << "\n"; OS << " da analyze - "; @@ -203,7 +203,7 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, // Normalize negative direction vectors if required by clients. if (NormalizeResults && D->normalize(&SE)) - OS << "normalized - "; + OS << "normalized - "; D->dump(OS); for (unsigned Level = 1; Level <= D->getLevels(); Level++) { if (D->isSplitable(Level)) { @@ -227,8 +227,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, void DependenceAnalysisWrapperPass::print(raw_ostream &OS, const Module *) const { - dumpExampleDependence(OS, info.get(), - getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false); + dumpExampleDependence( + OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false); } PreservedAnalyses @@ -249,33 +249,26 @@ bool Dependence::isInput() const { return Src->mayReadFromMemory() && Dst->mayReadFromMemory(); } - // Returns true if this is an output dependence. bool Dependence::isOutput() const { return Src->mayWriteToMemory() && Dst->mayWriteToMemory(); } - // Returns true if this is an flow (aka true) dependence. bool Dependence::isFlow() const { return Src->mayWriteToMemory() && Dst->mayReadFromMemory(); } - // Returns true if this is an anti dependence. bool Dependence::isAnti() const { return Src->mayReadFromMemory() && Dst->mayWriteToMemory(); } - // Returns true if a particular level is scalar; that is, // if no subscript in the source or destination mention the induction // variable associated with the loop at this level. 
// Leave this out of line, so it will serve as a virtual method anchor -bool Dependence::isScalar(unsigned level) const { - return false; -} - +bool Dependence::isScalar(unsigned level) const { return false; } //===----------------------------------------------------------------------===// // FullDependence methods @@ -338,8 +331,7 @@ bool FullDependence::normalize(ScalarEvolution *SE) { DV[Level - 1].Direction = RevDirection; // Reverse the dependence distance as well. if (DV[Level - 1].Distance != nullptr) - DV[Level - 1].Distance = - SE->getNegativeSCEV(DV[Level - 1].Distance); + DV[Level - 1].Distance = SE->getNegativeSCEV(DV[Level - 1].Distance); } LLVM_DEBUG(dbgs() << "After normalizing negative direction vectors:\n"; @@ -355,14 +347,12 @@ unsigned FullDependence::getDirection(unsigned Level) const { return DV[Level - 1].Direction; } - // Returns the distance (or NULL) associated with a particular level. const SCEV *FullDependence::getDistance(unsigned Level) const { assert(0 < Level && Level <= Levels && "Level out of range"); return DV[Level - 1].Distance; } - // Returns true if a particular level is scalar; that is, // if no subscript in the source or destination mention the induction // variable associated with the loop at this level. @@ -371,7 +361,6 @@ bool FullDependence::isScalar(unsigned Level) const { return DV[Level - 1].Scalar; } - // Returns true if peeling the first iteration from this loop // will break this dependence. bool FullDependence::isPeelFirst(unsigned Level) const { @@ -379,7 +368,6 @@ bool FullDependence::isPeelFirst(unsigned Level) const { return DV[Level - 1].PeelFirst; } - // Returns true if peeling the last iteration from this loop // will break this dependence. bool FullDependence::isPeelLast(unsigned Level) const { @@ -387,14 +375,12 @@ bool FullDependence::isPeelLast(unsigned Level) const { return DV[Level - 1].PeelLast; } - // Returns true if splitting this loop will break the dependence. bool FullDependence::isSplitable(unsigned Level) const { assert(0 < Level && Level <= Levels && "Level out of range"); return DV[Level - 1].Splitable; } - //===----------------------------------------------------------------------===// // DependenceInfo::Constraint methods @@ -405,7 +391,6 @@ const SCEV *DependenceInfo::Constraint::getX() const { return A; } - // If constraint is a point <X, Y>, returns Y. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getY() const { @@ -413,7 +398,6 @@ const SCEV *DependenceInfo::Constraint::getY() const { return B; } - // If constraint is a line AX + BY = C, returns A. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getA() const { @@ -422,7 +406,6 @@ const SCEV *DependenceInfo::Constraint::getA() const { return A; } - // If constraint is a line AX + BY = C, returns B. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getB() const { @@ -431,7 +414,6 @@ const SCEV *DependenceInfo::Constraint::getB() const { return B; } - // If constraint is a line AX + BY = C, returns C. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getC() const { @@ -440,7 +422,6 @@ const SCEV *DependenceInfo::Constraint::getC() const { return C; } - // If constraint is a distance, returns D. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getD() const { @@ -448,7 +429,6 @@ const SCEV *DependenceInfo::Constraint::getD() const { return SE->getNegativeSCEV(C); } - // Returns the loop associated with this constraint. 
const Loop *DependenceInfo::Constraint::getAssociatedLoop() const { assert((Kind == Distance || Kind == Line || Kind == Point) && @@ -499,17 +479,16 @@ LLVM_DUMP_METHOD void DependenceInfo::Constraint::dump(raw_ostream &OS) const { else if (isPoint()) OS << " Point is <" << *getX() << ", " << *getY() << ">\n"; else if (isDistance()) - OS << " Distance is " << *getD() << - " (" << *getA() << "*X + " << *getB() << "*Y = " << *getC() << ")\n"; + OS << " Distance is " << *getD() << " (" << *getA() << "*X + " << *getB() + << "*Y = " << *getC() << ")\n"; else if (isLine()) - OS << " Line is " << *getA() << "*X + " << - *getB() << "*Y = " << *getC() << "\n"; + OS << " Line is " << *getA() << "*X + " << *getB() << "*Y = " << *getC() + << "\n"; else llvm_unreachable("unknown constraint type in Constraint::dump"); } #endif - // Updates X with the intersection // of the Constraints X and Y. Returns true if X has changed. // Corresponds to Figure 4 from the paper @@ -591,15 +570,14 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { const SCEV *A1B2 = SE->getMulExpr(X->getA(), Y->getB()); const SCEV *A2B1 = SE->getMulExpr(Y->getA(), X->getB()); const SCEVConstant *C1A2_C2A1 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1)); + dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1)); const SCEVConstant *C1B2_C2B1 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1)); + dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1)); const SCEVConstant *A1B2_A2B1 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1)); + dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1)); const SCEVConstant *A2B1_A1B2 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2)); - if (!C1B2_C2B1 || !C1A2_C2A1 || - !A1B2_A2B1 || !A2B1_A1B2) + dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2)); + if (!C1B2_C2B1 || !C1A2_C2A1 || !A1B2_A2B1 || !A2B1_A1B2) return false; APInt Xtop = C1B2_C2B1->getAPInt(); APInt Xbot = A1B2_A2B1->getAPInt(); @@ -626,8 +604,8 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { ++DeltaSuccesses; return true; } - if (const SCEVConstant *CUB = - collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) { + if (const SCEVConstant *CUB = collectConstantUpperBound( + X->getAssociatedLoop(), Prod1->getType())) { const APInt &UpperBound = CUB->getAPInt(); LLVM_DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n"); if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) { @@ -636,8 +614,7 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { return true; } } - X->setPoint(SE->getConstant(Xq), - SE->getConstant(Yq), + X->setPoint(SE->getConstant(Xq), SE->getConstant(Yq), X->getAssociatedLoop()); ++DeltaSuccesses; return true; @@ -667,7 +644,6 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { return false; } - //===----------------------------------------------------------------------===// // DependenceInfo methods @@ -737,8 +713,7 @@ void Dependence::dump(raw_ostream &OS) const { // tbaa, non-overlapping regions etc), then it is known there is no dependecy. // Otherwise the underlying objects are checked to see if they point to // different identifiable objects. 
-static AliasResult underlyingObjectsAlias(AAResults *AA, - const DataLayout &DL, +static AliasResult underlyingObjectsAlias(AAResults *AA, const DataLayout &DL, const MemoryLocation &LocA, const MemoryLocation &LocB) { // Check the original locations (minus size) for noalias, which can happen for @@ -773,8 +748,7 @@ static AliasResult underlyingObjectsAlias(AAResults *AA, // Returns true if the load or store can be analyzed. Atomic and volatile // operations have properties which this analysis does not understand. -static -bool isLoadOrStore(const Instruction *I) { +static bool isLoadOrStore(const Instruction *I) { if (const LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->isUnordered(); else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) @@ -782,7 +756,6 @@ bool isLoadOrStore(const Instruction *I) { return false; } - // Examines the loop nesting of the Src and Dst // instructions and establishes their shared loops. Sets the variables // CommonLevels, SrcLevels, and MaxLevels. @@ -860,14 +833,12 @@ void DependenceInfo::establishNestingLevels(const Instruction *Src, MaxLevels -= CommonLevels; } - // Given one of the loops containing the source, return // its level index in our numbering scheme. unsigned DependenceInfo::mapSrcLoop(const Loop *SrcLoop) const { return SrcLoop->getLoopDepth(); } - // Given one of the loops containing the destination, // return its level index in our numbering scheme. unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const { @@ -880,7 +851,6 @@ unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const { return D; } - // Returns true if Expression is loop invariant in LoopNest. bool DependenceInfo::isLoopInvariant(const SCEV *Expression, const Loop *LoopNest) const { @@ -896,8 +866,6 @@ bool DependenceInfo::isLoopInvariant(const SCEV *Expression, return SE->isLoopInvariant(Expression, LoopNest->getOutermostLoop()); } - - // Finds the set of loops from the LoopNest that // have a level <= CommonLevels and are referred to by the SCEV Expression. void DependenceInfo::collectCommonLoops(const SCEV *Expression, @@ -924,9 +892,9 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) { IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType()); IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType()); if (SrcTy == nullptr || DstTy == nullptr) { - assert(SrcTy == DstTy && "This function only unify integer types and " - "expect Src and Dst share the same type " - "otherwise."); + assert(SrcTy == DstTy && + "This function only unify integer types and " + "expect Src and Dst share the same type otherwise."); continue; } if (SrcTy->getBitWidth() > widestWidthSeen) { @@ -939,7 +907,6 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) { } } - assert(widestWidthSeen > 0); // Now extend each pair to the widest seen. 
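Aside (not part of the patch): the width unification in the hunk above amounts to sign-extending every narrower subscript to the widest integer type seen, sign extension being the safe choice because getelementptr treats its indices as signed. A self-contained sketch of that extension on raw bits, assuming widths strictly below 64; the pass itself builds SCEV sign-extend expressions via ScalarEvolution.

#include <cassert>
#include <cstdint>

// Reinterpret the low FromBits bits of V as a signed two's-complement value.
// Hypothetical helper, shown only to make the extension step concrete.
int64_t signExtendSketch(uint64_t V, unsigned FromBits) {
  assert(FromBits > 0 && FromBits < 64 && "sketch handles widths 1..63");
  uint64_t SignBit = uint64_t(1) << (FromBits - 1);
  uint64_t Mask = (uint64_t(1) << FromBits) - 1;
  V &= Mask;                                  // keep only the low FromBits
  // (V ^ SignBit) - SignBit maps [0, 2^FromBits) onto
  // [-2^(FromBits-1), 2^(FromBits-1)) without signed overflow.
  return static_cast<int64_t>(V ^ SignBit) - static_cast<int64_t>(SignBit);
}
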
@@ -949,9 +916,9 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) { IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType()); IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType()); if (SrcTy == nullptr || DstTy == nullptr) { - assert(SrcTy == DstTy && "This function only unify integer types and " - "expect Src and Dst share the same type " - "otherwise."); + assert(SrcTy == DstTy && + "This function only unify integer types and " + "expect Src and Dst share the same type otherwise."); continue; } if (SrcTy->getBitWidth() < widestWidthSeen) @@ -1028,7 +995,6 @@ bool DependenceInfo::checkDstSubscript(const SCEV *Dst, const Loop *LoopNest, return checkSubscript(Dst, LoopNest, Loops, false); } - // Examines the subscript pair (the Src and Dst SCEVs) // and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear. // Collects the associated loops in a set. @@ -1049,14 +1015,12 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest, return Subscript::ZIV; if (N == 1) return Subscript::SIV; - if (N == 2 && (SrcLoops.count() == 0 || - DstLoops.count() == 0 || + if (N == 2 && (SrcLoops.count() == 0 || DstLoops.count() == 0 || (SrcLoops.count() == 1 && DstLoops.count() == 1))) return Subscript::RDIV; return Subscript::MIV; } - // A wrapper around SCEV::isKnownPredicate. // Looks for cases where we're interested in comparing for equality. // If both X and Y have been identically sign or zero extended, @@ -1069,12 +1033,9 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest, // involving symbolics. bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, const SCEV *Y) const { - if (Pred == CmpInst::ICMP_EQ || - Pred == CmpInst::ICMP_NE) { - if ((isa<SCEVSignExtendExpr>(X) && - isa<SCEVSignExtendExpr>(Y)) || - (isa<SCEVZeroExtendExpr>(X) && - isa<SCEVZeroExtendExpr>(Y))) { + if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { + if ((isa<SCEVSignExtendExpr>(X) && isa<SCEVSignExtendExpr>(Y)) || + (isa<SCEVZeroExtendExpr>(X) && isa<SCEVZeroExtendExpr>(Y))) { const SCEVIntegralCastExpr *CX = cast<SCEVIntegralCastExpr>(X); const SCEVIntegralCastExpr *CY = cast<SCEVIntegralCastExpr>(Y); const SCEV *Xop = CX->getOperand(); @@ -1111,7 +1072,10 @@ bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, } } -/// Compare to see if S is less than Size, using isKnownNegative(S - max(Size, 1)) +/// Compare to see if S is less than Size, using +/// +/// isKnownNegative(S - max(Size, 1)) +/// /// with some extra checking if S is an AddRec and we can prove less-than using /// the loop bounds. bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const { @@ -1178,7 +1142,6 @@ const SCEV *DependenceInfo::collectUpperBound(const Loop *L, Type *T) const { return nullptr; } - // Calls collectUpperBound(), then attempts to cast it to SCEVConstant. // If the cast fails, returns NULL. 
const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L, @@ -1188,7 +1151,6 @@ const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L, return nullptr; } - // testZIV - // When we have a pair of subscripts of the form [c1] and [c2], // where c1 and c2 are both loop invariant, we attack it using @@ -1218,7 +1180,6 @@ bool DependenceInfo::testZIV(const SCEV *Src, const SCEV *Dst, return false; // possibly dependent } - // strongSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.1 // @@ -1270,9 +1231,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound); LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n"); const SCEV *AbsDelta = - SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta); + SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta); const SCEV *AbsCoeff = - SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff); + SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff); const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff); if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) { // Distance greater than trip count - no dependence @@ -1286,7 +1247,7 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, if (isa<SCEVConstant>(Delta) && isa<SCEVConstant>(Coeff)) { APInt ConstDelta = cast<SCEVConstant>(Delta)->getAPInt(); APInt ConstCoeff = cast<SCEVConstant>(Coeff)->getAPInt(); - APInt Distance = ConstDelta; // these need to be initialized + APInt Distance = ConstDelta; // these need to be initialized APInt Remainder = ConstDelta; APInt::sdivrem(ConstDelta, ConstCoeff, Distance, Remainder); LLVM_DEBUG(dbgs() << "\t Distance = " << Distance << "\n"); @@ -1307,29 +1268,25 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, else Result.DV[Level].Direction &= Dependence::DVEntry::EQ; ++StrongSIVsuccesses; - } - else if (Delta->isZero()) { + } else if (Delta->isZero()) { // since 0/X == 0 Result.DV[Level].Distance = Delta; NewConstraint.setDistance(Delta, CurLoop); Result.DV[Level].Direction &= Dependence::DVEntry::EQ; ++StrongSIVsuccesses; - } - else { + } else { if (Coeff->isOne()) { LLVM_DEBUG(dbgs() << "\t Distance = " << *Delta << "\n"); Result.DV[Level].Distance = Delta; // since X/1 == X NewConstraint.setDistance(Delta, CurLoop); - } - else { + } else { Result.Consistent = false; - NewConstraint.setLine(Coeff, - SE->getNegativeSCEV(Coeff), + NewConstraint.setLine(Coeff, SE->getNegativeSCEV(Coeff), SE->getNegativeSCEV(Delta), CurLoop); } // maybe we can get a useful direction - bool DeltaMaybeZero = !SE->isKnownNonZero(Delta); + bool DeltaMaybeZero = !SE->isKnownNonZero(Delta); bool DeltaMaybePositive = !SE->isKnownNonPositive(Delta); bool DeltaMaybeNegative = !SE->isKnownNonNegative(Delta); bool CoeffMaybePositive = !SE->isKnownNonPositive(Coeff); @@ -1353,7 +1310,6 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, return false; } - // weakCrossingSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.2 // @@ -1447,8 +1403,8 @@ bool DependenceInfo::weakCrossingSIVtest( if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) { LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n"); const SCEV *ConstantTwo = SE->getConstant(UpperBound->getType(), 2); - const SCEV *ML = SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound), - ConstantTwo); + const SCEV *ML = 
+ SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound), ConstantTwo); LLVM_DEBUG(dbgs() << "\t ML = " << *ML << "\n"); if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, ML)) { // Delta too big, no dependence @@ -1498,7 +1454,6 @@ bool DependenceInfo::weakCrossingSIVtest( return false; } - // Kirch's algorithm, from // // Optimizing Supercompilers for Supercomputers @@ -1519,9 +1474,11 @@ static bool findGCD(unsigned Bits, const APInt &AM, const APInt &BM, APInt R = G0; APInt::sdivrem(G0, G1, Q, R); while (R != 0) { + // clang-format off APInt A2 = A0 - Q*A1; A0 = A1; A1 = A2; APInt B2 = B0 - Q*B1; B0 = B1; B1 = B2; G0 = G1; G1 = R; + // clang-format on APInt::sdivrem(G0, G1, Q, R); } G = G1; @@ -1543,8 +1500,7 @@ static APInt floorOfQuotient(const APInt &A, const APInt &B) { APInt::sdivrem(A, B, Q, R); if (R == 0) return Q; - if ((A.sgt(0) && B.sgt(0)) || - (A.slt(0) && B.slt(0))) + if ((A.sgt(0) && B.sgt(0)) || (A.slt(0) && B.slt(0))) return Q; else return Q - 1; @@ -1556,8 +1512,7 @@ static APInt ceilingOfQuotient(const APInt &A, const APInt &B) { APInt::sdivrem(A, B, Q, R); if (R == 0) return Q; - if ((A.sgt(0) && B.sgt(0)) || - (A.slt(0) && B.slt(0))) + if ((A.sgt(0) && B.sgt(0)) || (A.slt(0) && B.slt(0))) return Q + 1; else return Q; @@ -1733,17 +1688,14 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, return Result.DV[Level].Direction == Dependence::DVEntry::NONE; } - // Return true if the divisor evenly divides the dividend. -static -bool isRemainderZero(const SCEVConstant *Dividend, - const SCEVConstant *Divisor) { +static bool isRemainderZero(const SCEVConstant *Dividend, + const SCEVConstant *Divisor) { const APInt &ConstDividend = Dividend->getAPInt(); const APInt &ConstDivisor = Divisor->getAPInt(); return ConstDividend.srem(ConstDivisor) == 0; } - // weakZeroSrcSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.2 // @@ -1807,11 +1759,11 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff, const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff); if (!ConstCoeff) return false; - const SCEV *AbsCoeff = - SE->isKnownNegative(ConstCoeff) ? - SE->getNegativeSCEV(ConstCoeff) : ConstCoeff; + const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff) + ? SE->getNegativeSCEV(ConstCoeff) + : ConstCoeff; const SCEV *NewDelta = - SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; + SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; // check that Delta/SrcCoeff < iteration count // really check NewDelta < count*AbsCoeff @@ -1853,7 +1805,6 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff, return false; } - // weakZeroDstSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.2 // @@ -1916,11 +1867,11 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff, const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff); if (!ConstCoeff) return false; - const SCEV *AbsCoeff = - SE->isKnownNegative(ConstCoeff) ? - SE->getNegativeSCEV(ConstCoeff) : ConstCoeff; + const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff) + ? SE->getNegativeSCEV(ConstCoeff) + : ConstCoeff; const SCEV *NewDelta = - SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; + SE->isKnownNegative(ConstCoeff) ? 
SE->getNegativeSCEV(Delta) : Delta; // check that Delta/SrcCoeff < iteration count // really check NewDelta < count*AbsCoeff @@ -1962,7 +1913,6 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff, return false; } - // exactRDIVtest - Tests the RDIV subscript pair for dependence. // Things of the form [c1 + a*i] and [c2 + b*j], // where i and j are induction variable, c1 and c2 are loop invariant, @@ -2084,7 +2034,6 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, return TL.sgt(TU); } - // symbolicRDIVtest - // In Section 4.5 of the Practical Dependence Testing paper,the authors // introduce a special case of Banerjee's Inequalities (also called the @@ -2167,8 +2116,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, return true; } } - } - else if (SE->isKnownNonPositive(A2)) { + } else if (SE->isKnownNonPositive(A2)) { // a1 >= 0 && a2 <= 0 if (N1 && N2) { // make sure that c2 - c1 <= a1*N1 - a2*N2 @@ -2187,8 +2135,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, return true; } } - } - else if (SE->isKnownNonPositive(A1)) { + } else if (SE->isKnownNonPositive(A1)) { if (SE->isKnownNonNegative(A2)) { // a1 <= 0 && a2 >= 0 if (N1 && N2) { @@ -2207,8 +2154,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, ++SymbolicRDIVindependence; return true; } - } - else if (SE->isKnownNonPositive(A2)) { + } else if (SE->isKnownNonPositive(A2)) { // a1 <= 0 && a2 <= 0 if (N1) { // make sure that a1*N1 <= c2 - c1 @@ -2233,7 +2179,6 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, return false; } - // testSIV - // When we have a pair of subscripts of the form [c1 + a1*i] and [c2 - a2*i] // where i is an induction variable, c1 and c2 are loop invariant, and a1 and @@ -2260,17 +2205,17 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, Level = mapSrcLoop(CurLoop); bool disproven; if (SrcCoeff == DstCoeff) - disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, - Level, Result, NewConstraint); + disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level, + Result, NewConstraint); else if (SrcCoeff == SE->getNegativeSCEV(DstCoeff)) disproven = weakCrossingSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level, Result, NewConstraint, SplitIter); else disproven = exactSIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, Level, Result, NewConstraint); - return disproven || - gcdMIVtest(Src, Dst, Result) || - symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, CurLoop); + return disproven || gcdMIVtest(Src, Dst, Result) || + symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, + CurLoop); } if (SrcAddRec) { const SCEV *SrcConst = SrcAddRec->getStart(); @@ -2278,9 +2223,9 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, const SCEV *DstConst = Dst; const Loop *CurLoop = SrcAddRec->getLoop(); Level = mapSrcLoop(CurLoop); - return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, - Level, Result, NewConstraint) || - gcdMIVtest(Src, Dst, Result); + return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level, + Result, NewConstraint) || + gcdMIVtest(Src, Dst, Result); } if (DstAddRec) { const SCEV *DstConst = DstAddRec->getStart(); @@ -2288,15 +2233,14 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, const SCEV *SrcConst = Src; const Loop *CurLoop = DstAddRec->getLoop(); Level = mapDstLoop(CurLoop); - return 
weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst, - CurLoop, Level, Result, NewConstraint) || - gcdMIVtest(Src, Dst, Result); + return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst, CurLoop, Level, + Result, NewConstraint) || + gcdMIVtest(Src, Dst, Result); } llvm_unreachable("SIV test expected at least one AddRec"); return false; } - // testRDIV - // When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*j] // where i and j are induction variables, c1 and c2 are loop invariant, @@ -2333,46 +2277,37 @@ bool DependenceInfo::testRDIV(const SCEV *Src, const SCEV *Dst, DstConst = DstAddRec->getStart(); DstCoeff = DstAddRec->getStepRecurrence(*SE); DstLoop = DstAddRec->getLoop(); - } - else if (SrcAddRec) { + } else if (SrcAddRec) { if (const SCEVAddRecExpr *tmpAddRec = - dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) { + dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) { SrcConst = tmpAddRec->getStart(); SrcCoeff = tmpAddRec->getStepRecurrence(*SE); SrcLoop = tmpAddRec->getLoop(); DstConst = Dst; DstCoeff = SE->getNegativeSCEV(SrcAddRec->getStepRecurrence(*SE)); DstLoop = SrcAddRec->getLoop(); - } - else + } else llvm_unreachable("RDIV reached by surprising SCEVs"); - } - else if (DstAddRec) { + } else if (DstAddRec) { if (const SCEVAddRecExpr *tmpAddRec = - dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) { + dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) { DstConst = tmpAddRec->getStart(); DstCoeff = tmpAddRec->getStepRecurrence(*SE); DstLoop = tmpAddRec->getLoop(); SrcConst = Src; SrcCoeff = SE->getNegativeSCEV(DstAddRec->getStepRecurrence(*SE)); SrcLoop = DstAddRec->getLoop(); - } - else + } else llvm_unreachable("RDIV reached by surprising SCEVs"); - } - else + } else llvm_unreachable("RDIV expected at least one AddRec"); - return exactRDIVtest(SrcCoeff, DstCoeff, - SrcConst, DstConst, - SrcLoop, DstLoop, + return exactRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, SrcLoop, DstLoop, Result) || - gcdMIVtest(Src, Dst, Result) || - symbolicRDIVtest(SrcCoeff, DstCoeff, - SrcConst, DstConst, - SrcLoop, DstLoop); + gcdMIVtest(Src, Dst, Result) || + symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, SrcLoop, + DstLoop); } - // Tests the single-subscript MIV pair (Src and Dst) for dependence. // Return true if dependence disproved. // Can sometimes refine direction vectors. @@ -2383,7 +2318,7 @@ bool DependenceInfo::testMIV(const SCEV *Src, const SCEV *Dst, LLVM_DEBUG(dbgs() << " dst = " << *Dst << "\n"); Result.Consistent = false; return gcdMIVtest(Src, Dst, Result) || - banerjeeMIVtest(Src, Dst, Loops, Result); + banerjeeMIVtest(Src, Dst, Loops, Result); } // Given a product, e.g., 10*X*Y, returns the first constant operand, @@ -2428,7 +2363,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, // we can't quit the loop just because the GCD == 1. const SCEV *Coefficients = Src; while (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(Coefficients)) { + dyn_cast<SCEVAddRecExpr>(Coefficients)) { const SCEV *Coeff = AddRec->getStepRecurrence(*SE); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. @@ -2446,7 +2381,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, // we can't quit the loop just because the GCD == 1. 
Coefficients = Dst; while (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(Coefficients)) { + dyn_cast<SCEVAddRecExpr>(Coefficients)) { const SCEV *Coeff = AddRec->getStepRecurrence(*SE); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. @@ -2468,16 +2403,14 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, if (isa<SCEVConstant>(Operand)) { assert(!Constant && "Surprised to find multiple constants"); Constant = cast<SCEVConstant>(Operand); - } - else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) { + } else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) { // Search for constant operand to participate in GCD; // If none found; return false. std::optional<APInt> ConstOp = getConstantPart(Product); if (!ConstOp) return false; ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD, ConstOp->abs()); - } - else + } else return false; } } @@ -2512,7 +2445,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, bool Improved = false; Coefficients = Src; while (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(Coefficients)) { + dyn_cast<SCEVAddRecExpr>(Coefficients)) { Coefficients = AddRec->getStart(); const Loop *CurLoop = AddRec->getLoop(); RunningGCD = ExtraGCD; @@ -2578,7 +2511,6 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, return false; } - //===----------------------------------------------------------------------===// // banerjeeMIVtest - // Use Banerjee's Inequalities to test an MIV subscript pair. @@ -2652,8 +2584,8 @@ bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, if (testBounds(Dependence::DVEntry::ALL, 0, Bound, Delta)) { // Explore the direction vector hierarchy. unsigned DepthExpanded = 0; - unsigned NewDeps = exploreDirections(1, A, B, Bound, - Loops, DepthExpanded, Delta); + unsigned NewDeps = + exploreDirections(1, A, B, Bound, Loops, DepthExpanded, Delta); if (NewDeps > 0) { bool Improved = false; for (unsigned K = 1; K <= CommonLevels; ++K) { @@ -2670,23 +2602,20 @@ bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, } if (Improved) ++BanerjeeSuccesses; - } - else { + } else { ++BanerjeeIndependence; Disproved = true; } - } - else { + } else { ++BanerjeeIndependence; Disproved = true; } - delete [] Bound; - delete [] A; - delete [] B; + delete[] Bound; + delete[] A; + delete[] B; return Disproved; } - // Hierarchically expands the direction vector // search space, combining the directions of discovered dependences // in the DirSet field of Bound. Returns the number of distinct @@ -2788,27 +2717,26 @@ unsigned DependenceInfo::exploreDirections(unsigned Level, CoefficientInfo *A, // test bounds for <, *, *, ... if (testBounds(Dependence::DVEntry::LT, Level, Bound, Delta)) - NewDeps += exploreDirections(Level + 1, A, B, Bound, - Loops, DepthExpanded, Delta); + NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); // Test bounds for =, *, *, ... if (testBounds(Dependence::DVEntry::EQ, Level, Bound, Delta)) - NewDeps += exploreDirections(Level + 1, A, B, Bound, - Loops, DepthExpanded, Delta); + NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); // test bounds for >, *, *, ... 
if (testBounds(Dependence::DVEntry::GT, Level, Bound, Delta)) - NewDeps += exploreDirections(Level + 1, A, B, Bound, - Loops, DepthExpanded, Delta); + NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); Bound[Level].Direction = Dependence::DVEntry::ALL; return NewDeps; - } - else - return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, Delta); + } else + return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); } - // Returns true iff the current bounds are plausible. bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, BoundInfo *Bound, const SCEV *Delta) const { @@ -2822,7 +2750,6 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, return true; } - // Computes the upper and lower bounds for level K // using the * direction. Records them in Bound. // Wolfe gives the equations @@ -2840,17 +2767,16 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, // and the upper bound is always >= 0. void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::ALL] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::ALL] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::ALL] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::ALL] = + nullptr; // Default value = +infinity. if (Bound[K].Iterations) { - Bound[K].Lower[Dependence::DVEntry::ALL] = - SE->getMulExpr(SE->getMinusSCEV(A[K].NegPart, B[K].PosPart), - Bound[K].Iterations); - Bound[K].Upper[Dependence::DVEntry::ALL] = - SE->getMulExpr(SE->getMinusSCEV(A[K].PosPart, B[K].NegPart), - Bound[K].Iterations); - } - else { + Bound[K].Lower[Dependence::DVEntry::ALL] = SE->getMulExpr( + SE->getMinusSCEV(A[K].NegPart, B[K].PosPart), Bound[K].Iterations); + Bound[K].Upper[Dependence::DVEntry::ALL] = SE->getMulExpr( + SE->getMinusSCEV(A[K].PosPart, B[K].NegPart), Bound[K].Iterations); + } else { // If the difference is 0, we won't need to know the number of iterations. if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].NegPart, B[K].PosPart)) Bound[K].Lower[Dependence::DVEntry::ALL] = @@ -2861,7 +2787,6 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, } } - // Computes the upper and lower bounds for level K // using the = direction. Records them in Bound. // Wolfe gives the equations @@ -2879,18 +2804,19 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, // and the upper bound is always >= 0. void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::EQ] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::EQ] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::EQ] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::EQ] = + nullptr; // Default value = +infinity. 
if (Bound[K].Iterations) { const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff); const SCEV *NegativePart = getNegativePart(Delta); Bound[K].Lower[Dependence::DVEntry::EQ] = - SE->getMulExpr(NegativePart, Bound[K].Iterations); + SE->getMulExpr(NegativePart, Bound[K].Iterations); const SCEV *PositivePart = getPositivePart(Delta); Bound[K].Upper[Dependence::DVEntry::EQ] = - SE->getMulExpr(PositivePart, Bound[K].Iterations); - } - else { + SE->getMulExpr(PositivePart, Bound[K].Iterations); + } else { // If the positive/negative part of the difference is 0, // we won't need to know the number of iterations. const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff); @@ -2903,7 +2829,6 @@ void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, } } - // Computes the upper and lower bounds for level K // using the < direction. Records them in Bound. // Wolfe gives the equations @@ -2919,35 +2844,35 @@ void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, // We must be careful to handle the case where the upper bound is unknown. void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::LT] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::LT] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::LT] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::LT] = + nullptr; // Default value = +infinity. if (Bound[K].Iterations) { const SCEV *Iter_1 = SE->getMinusSCEV( Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType())); const SCEV *NegPart = - getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); + getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); Bound[K].Lower[Dependence::DVEntry::LT] = - SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff); + SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff); const SCEV *PosPart = - getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); + getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); Bound[K].Upper[Dependence::DVEntry::LT] = - SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff); - } - else { + SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff); + } else { // If the positive/negative part of the difference is 0, // we won't need to know the number of iterations. const SCEV *NegPart = - getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); + getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); if (NegPart->isZero()) Bound[K].Lower[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff); const SCEV *PosPart = - getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); + getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); if (PosPart->isZero()) Bound[K].Upper[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff); } } - // Computes the upper and lower bounds for level K // using the > direction. Records them in Bound. // Wolfe gives the equations @@ -2963,45 +2888,45 @@ void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B, // We must be careful to handle the case where the upper bound is unknown. void DependenceInfo::findBoundsGT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::GT] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::GT] = nullptr; // Default value = +infinity. 
+ Bound[K].Lower[Dependence::DVEntry::GT] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::GT] = + nullptr; // Default value = +infinity. if (Bound[K].Iterations) { const SCEV *Iter_1 = SE->getMinusSCEV( Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType())); const SCEV *NegPart = - getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); + getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); Bound[K].Lower[Dependence::DVEntry::GT] = - SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff); + SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff); const SCEV *PosPart = - getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); + getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); Bound[K].Upper[Dependence::DVEntry::GT] = - SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff); - } - else { + SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff); + } else { // If the positive/negative part of the difference is 0, // we won't need to know the number of iterations. - const SCEV *NegPart = getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); + const SCEV *NegPart = + getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); if (NegPart->isZero()) Bound[K].Lower[Dependence::DVEntry::GT] = A[K].Coeff; - const SCEV *PosPart = getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); + const SCEV *PosPart = + getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); if (PosPart->isZero()) Bound[K].Upper[Dependence::DVEntry::GT] = A[K].Coeff; } } - // X^+ = max(X, 0) const SCEV *DependenceInfo::getPositivePart(const SCEV *X) const { return SE->getSMaxExpr(X, SE->getZero(X->getType())); } - // X^- = min(X, 0) const SCEV *DependenceInfo::getNegativePart(const SCEV *X) const { return SE->getSMinExpr(X, SE->getZero(X->getType())); } - // Walks through the subscript, // collecting each coefficient, the associated loop bounds, // and recording its positive and negative parts for later use. @@ -3046,7 +2971,6 @@ DependenceInfo::collectCoeffInfo(const SCEV *Subscript, bool SrcFlag, return CI; } - // Looks through all the bounds info and // computes the lower bound given the current direction settings // at each level. If the lower bound for any level is -inf, @@ -3062,7 +2986,6 @@ const SCEV *DependenceInfo::getLowerBound(BoundInfo *Bound) const { return Sum; } - // Looks through all the bounds info and // computes the upper bound given the current direction settings // at each level. If the upper bound at any level is +inf, @@ -3078,7 +3001,6 @@ const SCEV *DependenceInfo::getUpperBound(BoundInfo *Bound) const { return Sum; } - //===----------------------------------------------------------------------===// // Constraint manipulation for Delta test. @@ -3098,7 +3020,6 @@ const SCEV *DependenceInfo::findCoefficient(const SCEV *Expr, return findCoefficient(AddRec->getStart(), TargetLoop); } - // Given a linear SCEV, // return the SCEV given by zeroing out the coefficient // corresponding to the specified loop. @@ -3112,12 +3033,10 @@ const SCEV *DependenceInfo::zeroCoefficient(const SCEV *Expr, if (AddRec->getLoop() == TargetLoop) return AddRec->getStart(); return SE->getAddRecExpr(zeroCoefficient(AddRec->getStart(), TargetLoop), - AddRec->getStepRecurrence(*SE), - AddRec->getLoop(), + AddRec->getStepRecurrence(*SE), AddRec->getLoop(), AddRec->getNoWrapFlags()); } - // Given a linear SCEV Expr, // return the SCEV given by adding some Value to the // coefficient corresponding to the specified TargetLoop. 
@@ -3128,17 +3047,13 @@ const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr, const SCEV *Value) const { const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr); if (!AddRec) // create a new addRec - return SE->getAddRecExpr(Expr, - Value, - TargetLoop, + return SE->getAddRecExpr(Expr, Value, TargetLoop, SCEV::FlagAnyWrap); // Worst case, with no info. if (AddRec->getLoop() == TargetLoop) { const SCEV *Sum = SE->getAddExpr(AddRec->getStepRecurrence(*SE), Value); if (Sum->isZero()) return AddRec->getStart(); - return SE->getAddRecExpr(AddRec->getStart(), - Sum, - AddRec->getLoop(), + return SE->getAddRecExpr(AddRec->getStart(), Sum, AddRec->getLoop(), AddRec->getNoWrapFlags()); } if (SE->isLoopInvariant(AddRec, TargetLoop)) @@ -3149,7 +3064,6 @@ const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr, AddRec->getNoWrapFlags()); } - // Review the constraints, looking for opportunities // to simplify a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3178,7 +3092,6 @@ bool DependenceInfo::propagate(const SCEV *&Src, const SCEV *&Dst, return Result; } - // Attempt to propagate a distance // constraint into a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3204,7 +3117,6 @@ bool DependenceInfo::propagateDistance(const SCEV *&Src, const SCEV *&Dst, return true; } - // Attempt to propagate a line // constraint into a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3224,22 +3136,22 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, if (A->isZero()) { const SCEVConstant *Bconst = dyn_cast<SCEVConstant>(B); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); - if (!Bconst || !Cconst) return false; + if (!Bconst || !Cconst) + return false; APInt Beta = Bconst->getAPInt(); APInt Charlie = Cconst->getAPInt(); APInt CdivB = Charlie.sdiv(Beta); assert(Charlie.srem(Beta) == 0 && "C should be evenly divisible by B"); const SCEV *AP_K = findCoefficient(Dst, CurLoop); - // Src = SE->getAddExpr(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB))); Src = SE->getMinusSCEV(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB))); Dst = zeroCoefficient(Dst, CurLoop); if (!findCoefficient(Src, CurLoop)->isZero()) Consistent = false; - } - else if (B->isZero()) { + } else if (B->isZero()) { const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); - if (!Aconst || !Cconst) return false; + if (!Aconst || !Cconst) + return false; APInt Alpha = Aconst->getAPInt(); APInt Charlie = Cconst->getAPInt(); APInt CdivA = Charlie.sdiv(Alpha); @@ -3249,11 +3161,11 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, Src = zeroCoefficient(Src, CurLoop); if (!findCoefficient(Dst, CurLoop)->isZero()) Consistent = false; - } - else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) { + } else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) { const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); - if (!Aconst || !Cconst) return false; + if (!Aconst || !Cconst) + return false; APInt Alpha = Aconst->getAPInt(); APInt Charlie = Cconst->getAPInt(); APInt CdivA = Charlie.sdiv(Alpha); @@ -3264,8 +3176,7 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, Dst = addToCoefficient(Dst, CurLoop, A_K); if (!findCoefficient(Dst, CurLoop)->isZero()) Consistent = false; - } - else { + } else { // paper is incorrect here, or perhaps just misleading const SCEV *A_K = 
findCoefficient(Src, CurLoop); Src = SE->getMulExpr(Src, A); @@ -3281,7 +3192,6 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, return true; } - // Attempt to propagate a point // constraint into a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3302,7 +3212,6 @@ bool DependenceInfo::propagatePoint(const SCEV *&Src, const SCEV *&Dst, return true; } - // Update direction vector entry based on the current constraint. void DependenceInfo::updateDirection(Dependence::DVEntry &Level, const Constraint &CurConstraint) const { @@ -3322,34 +3231,28 @@ void DependenceInfo::updateDirection(Dependence::DVEntry &Level, if (!SE->isKnownNonNegative(Level.Distance)) // if may be negative NewDirection |= Dependence::DVEntry::GT; Level.Direction &= NewDirection; - } - else if (CurConstraint.isLine()) { + } else if (CurConstraint.isLine()) { Level.Scalar = false; Level.Distance = nullptr; // direction should be accurate - } - else if (CurConstraint.isPoint()) { + } else if (CurConstraint.isPoint()) { Level.Scalar = false; Level.Distance = nullptr; unsigned NewDirection = Dependence::DVEntry::NONE; - if (!isKnownPredicate(CmpInst::ICMP_NE, - CurConstraint.getY(), + if (!isKnownPredicate(CmpInst::ICMP_NE, CurConstraint.getY(), CurConstraint.getX())) // if X may be = Y NewDirection |= Dependence::DVEntry::EQ; - if (!isKnownPredicate(CmpInst::ICMP_SLE, - CurConstraint.getY(), + if (!isKnownPredicate(CmpInst::ICMP_SLE, CurConstraint.getY(), CurConstraint.getX())) // if Y may be > X NewDirection |= Dependence::DVEntry::LT; - if (!isKnownPredicate(CmpInst::ICMP_SGE, - CurConstraint.getY(), + if (!isKnownPredicate(CmpInst::ICMP_SGE, CurConstraint.getY(), CurConstraint.getX())) // if Y may be < X NewDirection |= Dependence::DVEntry::GT; Level.Direction &= NewDirection; - } - else + } else llvm_unreachable("constraint has unexpected kind"); } @@ -3425,7 +3328,7 @@ bool DependenceInfo::tryDelinearizeFixedSize( dyn_cast<SCEVUnknown>(SE->getPointerBase(DstAccessFn)); assert(SrcBase && DstBase && SrcBase == DstBase && "expected src and dst scev unknowns to be equal"); - }); + }); SmallVector<int, 4> SrcSizes; SmallVector<int, 4> DstSizes; @@ -3737,9 +3640,8 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, Pair[P].Group.resize(Pairs); removeMatchingExtensions(&Pair[P]); Pair[P].Classification = - classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), - Pair[P].Dst, LI->getLoopFor(Dst->getParent()), - Pair[P].Loops); + classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst, + LI->getLoopFor(Dst->getParent()), Pair[P].Loops); Pair[P].GroupLoops = Pair[P].Loops; Pair[P].Group.set(P); LLVM_DEBUG(dbgs() << " subscript " << P << "\n"); @@ -3814,18 +3716,15 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, if (Pair[SI].Classification == Subscript::NonLinear) { // ignore these, but collect loops for later ++NonlinearSubscriptPairs; - collectCommonLoops(Pair[SI].Src, - LI->getLoopFor(Src->getParent()), + collectCommonLoops(Pair[SI].Src, LI->getLoopFor(Src->getParent()), Pair[SI].Loops); - collectCommonLoops(Pair[SI].Dst, - LI->getLoopFor(Dst->getParent()), + collectCommonLoops(Pair[SI].Dst, LI->getLoopFor(Dst->getParent()), Pair[SI].Loops); Result.Consistent = false; } else if (Pair[SI].Classification == Subscript::ZIV) { // always separable Separable.set(SI); - } - else { + } else { // SIV, RDIV, or MIV, so check for coupled group bool Done = true; for (unsigned SJ = SI + 1; SJ < Pairs; ++SJ) { @@ -3843,8 +3742,7 @@ 
DependenceInfo::depends(Instruction *Src, Instruction *Dst, if (Pair[SI].Group.count() == 1) { Separable.set(SI); ++SeparableSubscriptPairs; - } - else { + } else { Coupled.set(SI); ++CoupledSubscriptPairs; } @@ -3950,10 +3848,9 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, Constraints, Result.Consistent)) { LLVM_DEBUG(dbgs() << "\t Changed\n"); ++DeltaPropagations; - Pair[SJ].Classification = - classifyPair(Pair[SJ].Src, LI->getLoopFor(Src->getParent()), - Pair[SJ].Dst, LI->getLoopFor(Dst->getParent()), - Pair[SJ].Loops); + Pair[SJ].Classification = classifyPair( + Pair[SJ].Src, LI->getLoopFor(Src->getParent()), Pair[SJ].Dst, + LI->getLoopFor(Dst->getParent()), Pair[SJ].Loops); switch (Pair[SJ].Classification) { case Subscript::ZIV: LLVM_DEBUG(dbgs() << "ZIV\n"); @@ -3995,8 +3892,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, LLVM_DEBUG(dbgs() << "MIV test\n"); if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result)) return nullptr; - } - else + } else llvm_unreachable("expected only MIV subscripts at this point"); } @@ -4052,8 +3948,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, break; } } - } - else { + } else { // On the other hand, if all directions are equal and there's no // loop-independent dependence possible, then no dependence exists. bool AllEqual = true; @@ -4158,9 +4053,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, Pair[P].Group.resize(Pairs); removeMatchingExtensions(&Pair[P]); Pair[P].Classification = - classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), - Pair[P].Dst, LI->getLoopFor(Dst->getParent()), - Pair[P].Loops); + classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst, + LI->getLoopFor(Dst->getParent()), Pair[P].Loops); Pair[P].GroupLoops = Pair[P].Loops; Pair[P].Group.set(P); } @@ -4172,15 +4066,12 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, for (unsigned SI = 0; SI < Pairs; ++SI) { if (Pair[SI].Classification == Subscript::NonLinear) { // ignore these, but collect loops for later - collectCommonLoops(Pair[SI].Src, - LI->getLoopFor(Src->getParent()), + collectCommonLoops(Pair[SI].Src, LI->getLoopFor(Src->getParent()), Pair[SI].Loops); - collectCommonLoops(Pair[SI].Dst, - LI->getLoopFor(Dst->getParent()), + collectCommonLoops(Pair[SI].Dst, LI->getLoopFor(Dst->getParent()), Pair[SI].Loops); Result.Consistent = false; - } - else if (Pair[SI].Classification == Subscript::ZIV) + } else if (Pair[SI].Classification == Subscript::ZIV) Separable.set(SI); else { // SIV, RDIV, or MIV, so check for coupled group @@ -4214,8 +4105,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, case Subscript::SIV: { unsigned Level; const SCEV *SplitIter = nullptr; - (void) testSIV(Pair[SI].Src, Pair[SI].Dst, Level, - Result, NewConstraint, SplitIter); + (void)testSIV(Pair[SI].Src, Pair[SI].Dst, Level, Result, NewConstraint, + SplitIter); if (Level == SplitLevel) { assert(SplitIter != nullptr); return SplitIter; diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index b3b4c37..425ea31 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::exp: case Intrinsic::exp10: case Intrinsic::exp2: + case Intrinsic::ldexp: case Intrinsic::log: case Intrinsic::log10: case Intrinsic::log2: @@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::canonicalize: 
case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::ucmp: @@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( switch (ID) { case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::vp_lrint: @@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( case Intrinsic::vp_is_fpclass: return OpdIdx == 0; case Intrinsic::powi: + case Intrinsic::ldexp: return OpdIdx == -1 || OpdIdx == 1; default: return OpdIdx == -1; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 17a01f48..bf4c9f9 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1008,7 +1008,7 @@ unsigned TargetLoweringBase::getBitWidthForCttzElements( CR = CR.subtract(APInt(64, 1)); unsigned EltWidth = RetTy->getScalarSizeInBits(); - EltWidth = std::min(EltWidth, (unsigned)CR.getActiveBits()); + EltWidth = std::min(EltWidth, CR.getActiveBits()); EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8); return EltWidth; diff --git a/llvm/lib/Support/regcomp.c b/llvm/lib/Support/regcomp.c index 4ed5982..f5c4778 100644 --- a/llvm/lib/Support/regcomp.c +++ b/llvm/lib/Support/regcomp.c @@ -305,7 +305,7 @@ llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) { return (REG_INVARG); len = preg->re_endp - pattern; } else { - len = strlen((const char *)pattern); + len = strlen(pattern); } /* do the mallocs early so failure handling is easy */ diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a40de86b..3c06c6a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14742,6 +14742,106 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { return ResultSLI; } +static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AArch64TargetLowering &TLI) { + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); + + if (!VT.isVector()) + return SDValue(); + + if (VT.isScalableVector() && !Subtarget.hasSVE2()) + return SDValue(); + + if (VT.isFixedLengthVector() && + (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT))) + return SDValue(); + + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::AND) + return SDValue(); + + // InstCombine does (not (neg a)) => (add a -1). + // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c) + // Loop over all combinations of AND operands. + for (int i = 1; i >= 0; --i) { + for (int j = 1; j >= 0; --j) { + SDValue O0 = N0->getOperand(i); + SDValue O1 = N1->getOperand(j); + SDValue Sub, Add, SubSibling, AddSibling; + + // Find a SUB and an ADD operand, one from each AND. 
+ if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) { + Sub = O0; + Add = O1; + SubSibling = N0->getOperand(1 - i); + AddSibling = N1->getOperand(1 - j); + } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) { + Add = O0; + Sub = O1; + AddSibling = N0->getOperand(1 - i); + SubSibling = N1->getOperand(1 - j); + } else + continue; + + if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode())) + continue; + + // Constant ones is always righthand operand of the Add. + if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode())) + continue; + + if (Sub.getOperand(1) != Add.getOperand(0)) + continue; + + return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling); + } + } + + // (or (and a b) (and (not a) c)) => (bsl a b c) + // We only have to look for constant vectors here since the general, variable + // case can be handled in TableGen. + unsigned Bits = VT.getScalarSizeInBits(); + uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); + for (int i = 1; i >= 0; --i) + for (int j = 1; j >= 0; --j) { + APInt Val1, Val2; + + if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) && + ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) && + (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) { + return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); + BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); + if (!BVN0 || !BVN1) + continue; + + bool FoundMatch = true; + for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { + ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); + ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); + if (!CN0 || !CN1 || + CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { + FoundMatch = false; + break; + } + } + if (FoundMatch) + return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + + return SDValue(); +} + SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { if (useSVEForFixedLengthVectorVT(Op.getValueType(), @@ -19419,106 +19519,6 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, return FixConv; } -static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const AArch64TargetLowering &TLI) { - EVT VT = N->getValueType(0); - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); - - if (!VT.isVector()) - return SDValue(); - - if (VT.isScalableVector() && !Subtarget.hasSVE2()) - return SDValue(); - - if (VT.isFixedLengthVector() && - (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT))) - return SDValue(); - - SDValue N0 = N->getOperand(0); - if (N0.getOpcode() != ISD::AND) - return SDValue(); - - SDValue N1 = N->getOperand(1); - if (N1.getOpcode() != ISD::AND) - return SDValue(); - - // InstCombine does (not (neg a)) => (add a -1). - // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c) - // Loop over all combinations of AND operands. - for (int i = 1; i >= 0; --i) { - for (int j = 1; j >= 0; --j) { - SDValue O0 = N0->getOperand(i); - SDValue O1 = N1->getOperand(j); - SDValue Sub, Add, SubSibling, AddSibling; - - // Find a SUB and an ADD operand, one from each AND. 
- if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) { - Sub = O0; - Add = O1; - SubSibling = N0->getOperand(1 - i); - AddSibling = N1->getOperand(1 - j); - } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) { - Add = O0; - Sub = O1; - AddSibling = N0->getOperand(1 - i); - SubSibling = N1->getOperand(1 - j); - } else - continue; - - if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode())) - continue; - - // Constant ones is always righthand operand of the Add. - if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode())) - continue; - - if (Sub.getOperand(1) != Add.getOperand(0)) - continue; - - return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling); - } - } - - // (or (and a b) (and (not a) c)) => (bsl a b c) - // We only have to look for constant vectors here since the general, variable - // case can be handled in TableGen. - unsigned Bits = VT.getScalarSizeInBits(); - uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); - for (int i = 1; i >= 0; --i) - for (int j = 1; j >= 0; --j) { - APInt Val1, Val2; - - if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) && - ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) && - (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) { - return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), - N0->getOperand(1 - i), N1->getOperand(1 - j)); - } - BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); - BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); - if (!BVN0 || !BVN1) - continue; - - bool FoundMatch = true; - for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { - ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); - ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); - if (!CN0 || !CN1 || - CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { - FoundMatch = false; - break; - } - } - if (FoundMatch) - return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), - N0->getOperand(1 - i), N1->getOperand(1 - j)); - } - - return SDValue(); -} - // Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to // convert to csel(ccmp(.., cc0)), depending on cc1: diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index b97d622..fd4ef2a 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -8,8 +8,8 @@ // // This pass performs below peephole optimizations on MIR level. // -// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri -// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri +// 1. MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri +// MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri // // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi // MOVi64imm + ADDXrr ==> ADDXri + ADDXri @@ -128,6 +128,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { // Strategy used to split logical immediate bitmasks. enum class SplitStrategy { Intersect, + Disjoint, }; template <typename T> bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI, @@ -163,6 +164,7 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt", template <typename T> static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { T UImm = static_cast<T>(Imm); + assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!"); // The bitmask immediate consists of consecutive ones. 
Let's say there is // constant 0b00000000001000000000010000000000 which does not consist of @@ -191,18 +193,47 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { } template <typename T> +static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, + T &Imm2Enc) { + assert(Imm && (Imm != ~static_cast<T>(0)) && "Invalid immediate!"); + + // Try to split a bitmask of the form 0b00000000011000000000011110000000 into + // two disjoint masks such as 0b00000000011000000000000000000000 and + // 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the + // new masks match the original mask. + unsigned LowestBitSet = llvm::countr_zero(Imm); + unsigned LowestGapBitUnset = + LowestBitSet + llvm::countr_one(Imm >> LowestBitSet); + + // Create a mask for the least significant group of consecutive ones. + assert(LowestGapBitUnset < sizeof(T) * CHAR_BIT && "Undefined behaviour!"); + T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) - + (static_cast<T>(1) << LowestBitSet); + // Create a disjoint mask for the remaining ones. + T NewImm2 = Imm & ~NewImm1; + + // Do not split if NewImm2 is not a valid bitmask immediate. + if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize)) + return false; + + Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize); + Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize); + return true; +} + +template <typename T> bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, SplitStrategy Strategy, unsigned OtherOpc) { - // Try below transformation. + // Try below transformations. // - // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri - // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri + // MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri + // MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri // // The mov pseudo instruction could be expanded to multiple mov instructions // later. Let's try to split the constant operand of mov instruction into two - // bitmask immediates. It makes only two AND instructions instead of multiple - // mov + and instructions. + // bitmask immediates based on the given split strategy. It makes only two + // logical instructions instead of multiple mov + logic instructions. return splitTwoPartImm<T>( MI, @@ -224,6 +255,9 @@ bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, case SplitStrategy::Intersect: SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1); break; + case SplitStrategy::Disjoint: + SplitSucc = splitDisjointBitmaskImm(Imm, RegSize, Imm0, Imm1); + break; } if (SplitSucc) return std::make_pair(Opc, !OtherOpc ? 
Opc : OtherOpc); @@ -889,6 +923,22 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { Changed |= trySplitLogicalImm<uint64_t>( AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri); break; + case AArch64::EORWrr: + Changed |= trySplitLogicalImm<uint32_t>(AArch64::EORWri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::EORXrr: + Changed |= trySplitLogicalImm<uint64_t>(AArch64::EORXri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::ORRWrr: + Changed |= trySplitLogicalImm<uint32_t>(AArch64::ORRWri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::ORRXrr: + Changed |= trySplitLogicalImm<uint64_t>(AArch64::ORRXri, MI, + SplitStrategy::Disjoint); + break; case AArch64::ORRWrs: Changed |= visitORR(MI); break; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index adc984a..1bc1d98 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -22,7 +22,8 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove]>; + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost]>; def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [ @@ -45,7 +46,8 @@ def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost ]>; def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", @@ -53,7 +55,8 @@ def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove]>; + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost]>; def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520", "Cortex-A520AE ARM processors", [ @@ -756,7 +759,6 @@ def ProcessorFeatures { FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE, - FeatureUseFixedOverScalableIfEqualCost, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureFPAC]; list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVEBitPerm, @@ -766,7 +768,6 @@ def ProcessorFeatures { FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, FeatureJS, FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM, - FeatureUseFixedOverScalableIfEqualCost, FeatureDotProd, FeatureFPAC]; list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVEBitPerm, diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index 3955f2a..25ad9ec 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -669,7 +669,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, default: { // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows // us to fold the constant into the cmp instruction. 
- RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT); + RHS = DAG.getSignedConstant(C->getSExtValue() + 1, DL, VT); CC = ISD::SETGE; break; } @@ -713,7 +713,10 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to // fold the constant into the cmp instruction. if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { - RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT); + // Doing a "icmp ugt i16 65535, %0" comparison should have been converted + // already to something else. Assert to make sure this assumption holds. + assert((!C->isAllOnes()) && "integer overflow in comparison transform"); + RHS = DAG.getConstant(C->getZExtValue() + 1, DL, VT); CC = ISD::SETUGE; break; } diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index ffd900c..5153d24 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -56,6 +56,8 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_wave_reduce_sum: case Intrinsic::dx_wave_reduce_umax: case Intrinsic::dx_wave_reduce_usum: + case Intrinsic::dx_imad: + case Intrinsic::dx_umad: return true; default: return false; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 9003ace..d4f0cc9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -4046,6 +4046,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return true; } + case Intrinsic::nvvm_prefetch_tensormap: { + auto &DL = I.getDataLayout(); + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = getPointerTy(DL); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable; + Info.align.reset(); + return true; + } + case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_p: { diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d337192..d4a0ca7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -39,6 +39,12 @@ def AS_match { code global = [{ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL); }]; + code const = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST); + }]; + code param = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM); + }]; } @@ -950,33 +956,47 @@ foreach dim = 3...5 in { defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", [hasTMACTAGroupSupport]>; -//Prefetch and Prefetchu - -let Predicates = [hasPTX<80>, hasSM<90>] in { - class PREFETCH_INTRS<string InstName> : - BasicNVPTXInst<(outs), (ins ADDR:$addr), - InstName, - [(!cast<Intrinsic>(!strconcat("int_nvvm_", - !subst(".", "_", InstName))) addr:$addr)]>; +//Prefetchu and Prefetch - def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">; - def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">; - def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">; - def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">; - def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">; - def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">; +defvar frag_pat = (int_nvvm_prefetch_tensormap node:$addr); - def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins 
ADDR:$addr), - "prefetch.global.L2::evict_normal", - [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>; +multiclass PREFETCH_TENSORMAP_PATFRAG<string suffix, code predicate> { + def !tolower(suffix) : PatFrag<!setdagop(frag_pat, ops), frag_pat, predicate>; +} - def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr), - "prefetch.global.L2::evict_last", - [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"CONST", AS_match.const>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"GENERIC", AS_match.generic>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"PARAM", AS_match.param>; - def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">; +multiclass PREFETCH_TENSORMAP_INST<string addrspace_name, PatFrag pattern_frag> { + def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "prefetch" # addrspace_name # ".tensormap", + [(pattern_frag addr:$addr)]>, + Requires<[hasPTX<80>, hasSM<90>]>; } +defm PREFETCH_CONST_TENSORMAP : PREFETCH_TENSORMAP_INST<".const", prefetch_tensormap_const>; +defm PREFETCH_GENERIC_TENSORMAP : PREFETCH_TENSORMAP_INST<"", prefetch_tensormap_generic>; +defm PREFETCH_PARAM_TENSORMAP : PREFETCH_TENSORMAP_INST<".param", prefetch_tensormap_param>; + +class PREFETCH_INTRS<string InstName, Intrinsic Intr> : + BasicNVPTXInst<(outs), (ins ADDR:$addr), + InstName, + [(Intr addr:$addr)]>, + Requires<[hasPTX<80>, hasSM<90>]>; + +def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1", int_nvvm_prefetchu_L1>; +def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1", int_nvvm_prefetch_L1>; +def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2", int_nvvm_prefetch_L2>; +def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1", int_nvvm_prefetch_global_L1>; +def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1", int_nvvm_prefetch_local_L1>; +def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2", int_nvvm_prefetch_global_L2>; +def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2", int_nvvm_prefetch_local_L2>; +def PREFETCH_GLOBAL_L2_EVICT_NORMAL : PREFETCH_INTRS<"prefetch.global.L2::evict_normal", + int_nvvm_prefetch_global_L2_evict_normal>; +def PREFETCH_GLOBAL_L2_EVICT_LAST : PREFETCH_INTRS<"prefetch.global.L2::evict_last", + int_nvvm_prefetch_global_L2_evict_last>; + //Applypriority intrinsics class APPLYPRIORITY_L2_INTRS<string addrspace> : BasicNVPTXInst<(outs), (ins ADDR:$addr, B64:$size), diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 3ae2d9d..f4f8961 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -564,7 +564,8 @@ bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, case Intrinsic::nvvm_isspacep_global: case Intrinsic::nvvm_isspacep_local: case Intrinsic::nvvm_isspacep_shared: - case Intrinsic::nvvm_isspacep_shared_cluster: { + case Intrinsic::nvvm_isspacep_shared_cluster: + case Intrinsic::nvvm_prefetch_tensormap: { OpIndexes.push_back(0); return true; } @@ -587,6 +588,11 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, return ConstantInt::get(II->getType(), *R); return nullptr; } + case Intrinsic::nvvm_prefetch_tensormap: { + IRBuilder<> Builder(II); + return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap, + NewV); + } } return nullptr; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index bf23812..5541506 
100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -13,78 +13,113 @@ // //===----------------------------------------------------------------------===// -class SMX60IsWorstCaseMX<string mx, list<string> MxList> { - string LLMUL = LargestLMUL<MxList>.r; - bit c = !eq(mx, LLMUL); -} +//===----------------------------------------------------------------------===// +// Helpers + +// Maps LMUL string to corresponding value from the Values array +// LMUL values map to array indices as follows: +// MF8 -> Values[0], MF4 -> Values[1], MF2 -> Values[2], M1 -> Values[3], +// M2 -> Values[4], M4 -> Values[5], M8 -> Values[6] +// Shorter lists are allowed, e.g., widening instructions don't work on M8 +class GetLMULValue<list<int> Values, string LMUL> { + defvar Index = !cond( + !eq(LMUL, "MF8"): 0, + !eq(LMUL, "MF4"): 1, + !eq(LMUL, "MF2"): 2, + !eq(LMUL, "M1"): 3, + !eq(LMUL, "M2"): 4, + !eq(LMUL, "M4"): 5, + !eq(LMUL, "M8"): 6, + ); -class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> { - string LLMUL = LargestLMUL<MxList>.r; - int SSEW = SmallestSEW<mx, isF>.r; - bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); + assert !lt(Index, !size(Values)), + "Missing LMUL value for '" # LMUL # "'. " # + "Expected at least " # !add(Index, 1) # " elements, but got " # + !size(Values) # "."; + + int c = Values[Index]; } -defvar SMX60VLEN = 256; -defvar SMX60DLEN = !div(SMX60VLEN, 2); +// Returns BaseValue for LMUL values before startLMUL, Value for startLMUL, +// then doubles Value for each subsequent LMUL +// Example: ConstValueUntilLMULThenDoubleBase<"M1", 2, 4, "M8"> returns: +// MF8->2, MF4->2, MF2->2, M1->4, M2->8, M4->16, M8->32 +// This is useful for modeling scheduling parameters that scale with LMUL. 
+class ConstValueUntilLMULThenDoubleBase<string startLMUL, int BaseValue, int Value, string currentLMUL> { + assert !le(BaseValue, Value), "BaseValue must be less-equal to Value"; + defvar startPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], startLMUL>.c; + defvar currentPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], currentLMUL>.c; -class Get1248Latency<string mx> { + // Calculate the difference in positions + defvar posDiff = !sub(currentPos, startPos); + + // Calculate Value * (2^posDiff) int c = !cond( - !eq(mx, "M2") : 2, - !eq(mx, "M4") : 4, - !eq(mx, "M8") : 8, - true: 1 + !eq(posDiff, 0) : Value, + !eq(posDiff, 1) : !mul(Value, 2), + !eq(posDiff, 2) : !mul(Value, 4), + !eq(posDiff, 3) : !mul(Value, 8), + !eq(posDiff, 4) : !mul(Value, 16), + !eq(posDiff, 5) : !mul(Value, 32), + !eq(posDiff, 6) : !mul(Value, 64), + true : BaseValue ); } -// Used for: logical opsz, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides -class Get4816Latency<string mx> { - int c = !cond( - !eq(mx, "M4") : 8, - !eq(mx, "M8") : 16, - true: 4 - ); +// Same as the previous function but BaseValue == Value +class ConstValueUntilLMULThenDouble<string startLMUL, int Value, string currentLMUL> { + int c = ConstValueUntilLMULThenDoubleBase<startLMUL, Value, Value, currentLMUL>.c; +} + +// Returns MF8->1, MF4->1, MF2->2, M1->4, M2->8, M4->16, M8->32 +class ConstOneUntilMF4ThenDouble<string mx> { + int c = ConstValueUntilLMULThenDouble<"MF4", 1, mx>.c; +} + +// Returns MF8->1, MF4->1, MF2->1, M1->2, M2->4, M4->8, M8->16 +class ConstOneUntilMF2ThenDouble<string mx> { + int c = ConstValueUntilLMULThenDouble<"MF2", 1, mx>.c; +} + +// Returns MF8->1, MF4->1, MF2->1, M1->1, M2->2, M4->4, M8->8 +class ConstOneUntilM1ThenDouble<string mx> { + int c = ConstValueUntilLMULThenDouble<"M1", 1, mx>.c; } +//===----------------------------------------------------------------------===// +// Latency helper classes + // Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max -class Get458Latency<string mx> { - int c = !cond( - !eq(mx, "M4") : 5, - !eq(mx, "M8") : 8, - true: 4 - ); +class Get4458Latency<string mx> { + int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/4, /*M4=*/5, /*M8=*/8], mx>.c; } -// Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs -// Used for: widening operations +// Used for: widening operations (no M8) class Get4588Latency<string mx> { - int c = !cond( - !eq(mx, "M2") : 5, - !eq(mx, "M4") : 8, - !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback - true: 4 - ); + int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/5, /*M4=*/8], mx>.c; } // Used for: mask-producing comparisons, carry ops with mask, FP comparisons class Get461018Latency<string mx> { - int c = !cond( - !eq(mx, "M2") : 6, - !eq(mx, "M4") : 10, - !eq(mx, "M8") : 18, - true: 4 - ); + int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c; } -// Used for: e64 multiply pattern, complex ops -class Get781632Latency<string mx> { - int c = !cond( - !eq(mx, "M2") : 8, - !eq(mx, "M4") : 16, - !eq(mx, "M8") : 32, - true: 7 - ); +//===----------------------------------------------------------------------===// + +class SMX60IsWorstCaseMX<string mx, list<string> MxList> { + string LLMUL = LargestLMUL<MxList>.r; + bit c = !eq(mx, LLMUL); } +class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> { + string LLMUL = LargestLMUL<MxList>.r; + int SSEW = SmallestSEW<mx, isF>.r; + bit c = 
!and(!eq(mx, LLMUL), !eq(sew, SSEW)); +} + +defvar SMX60VLEN = 256; +defvar SMX60DLEN = !div(SMX60VLEN, 2); + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -383,12 +418,13 @@ foreach LMul = [1, 2, 4, 8] in { foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in { + let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [4] in { defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; } - let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in { + defvar VIALULat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + let Latency = VIALULat, ReleaseAtCycles = [4] in { // Pattern of vadd, vsub, vrsub: 4/4/5/8 // Pattern of vand, vor, vxor: 4/4/8/16 // They are grouped together, so we used the worst case 4/4/8/16 @@ -425,7 +461,7 @@ foreach mx = SchedMxList in { // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8, // e64 = 7,8,16,32. We use the worst-case until we can split the SEW. // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites - let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in { + let Latency = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c, ReleaseAtCycles = [7] in { defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; @@ -461,15 +497,8 @@ foreach mx = SchedMxList in { foreach sew = SchedSEWSet<mx>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - // Slightly reduced for fractional LMULs - defvar Multiplier = !cond( - !eq(mx, "MF8") : 12, - !eq(mx, "MF4") : 12, - !eq(mx, "MF2") : 12, - true: 24 - ); - - let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in { + defvar VIDivLat = ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c; + let Latency = VIDivLat, ReleaseAtCycles = [12] in { defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; } @@ -480,14 +509,8 @@ foreach mx = SchedMxList in { foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; - // Slightly increased for integer LMULs - defvar Multiplier = !cond( - !eq(mx, "M2") : 2, - !eq(mx, "M4") : 2, - true: 1 - ); - - let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in { + defvar VNarrowingLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; + let Latency = VNarrowingLat, ReleaseAtCycles = [4] in { defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 46b5673..4edf25c 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -789,6 +789,13 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, bool NeedsMemMove = false; IRBuilder<> IRB(BB, IP); + auto GetAllocaSize = [&](AllocaInst *AI) { + return IRB.CreateMul( + 
IRB.CreateZExtOrTrunc(AI->getArraySize(), IntptrTy), + ConstantInt::get(IntptrTy, + DL.getTypeAllocSize(AI->getAllocatedType()))); + }; + if (auto *A = dyn_cast<Argument>(V)) { assert(A->hasByValAttr() && "Type reset for non-byval argument?"); @@ -811,7 +818,11 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, } } } else if (auto *II = dyn_cast<LifetimeIntrinsic>(I)) { - Size = II->getArgOperand(0); + auto *AI = dyn_cast<AllocaInst>(II->getArgOperand(1)); + if (!AI) + return false; + + Size = GetAllocaSize(AI); Dest = II->getArgOperand(1); } else if (auto *AI = dyn_cast<AllocaInst>(I)) { // We need to clear the types for new stack allocations (or else we might @@ -820,10 +831,7 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(I))); IRB.SetInstDebugLocation(I); - Size = IRB.CreateMul( - IRB.CreateZExtOrTrunc(AI->getArraySize(), IntptrTy), - ConstantInt::get(IntptrTy, - DL.getTypeAllocSize(AI->getAllocatedType()))); + Size = GetAllocaSize(AI); Dest = I; } else { return false; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5d0e2f9..39011e7 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3883,6 +3883,7 @@ private: enum CombinedOpcode { NotCombinedOp = -1, MinMax = Instruction::OtherOpsEnd + 1, + FMulAdd, }; CombinedOpcode CombinedOp = NotCombinedOp; @@ -4033,6 +4034,9 @@ private: /// Returns true if any scalar in the list is a copyable element. bool hasCopyableElements() const { return !CopyableElements.empty(); } + /// Returns the state of the operations. + const InstructionsState &getOperations() const { return S; } + /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. unsigned findLaneForValue(Value *V) const { @@ -11987,6 +11991,81 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) { } } +static InstructionCost canConvertToFMA(ArrayRef<Value *> VL, + const InstructionsState &S, + DominatorTree &DT, const DataLayout &DL, + TargetTransformInfo &TTI, + const TargetLibraryInfo &TLI) { + assert(all_of(VL, + [](Value *V) { + return V->getType()->getScalarType()->isFloatingPointTy(); + }) && + "Can only convert to FMA for floating point types"); + assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub"); + + auto CheckForContractable = [&](ArrayRef<Value *> VL) { + FastMathFlags FMF; + FMF.set(); + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + // TODO: support for copyable elements. 
+ Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I); + if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI) + continue; + if (auto *FPCI = dyn_cast<FPMathOperator>(I)) + FMF &= FPCI->getFastMathFlags(); + } + return FMF.allowContract(); + }; + if (!CheckForContractable(VL)) + return InstructionCost::getInvalid(); + // fmul also should be contractable + InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI); + SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL); + + InstructionsState OpS = getSameOpcode(Operands.front(), TLI); + if (!OpS.valid()) + return InstructionCost::getInvalid(); + if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul) + return InstructionCost::getInvalid(); + if (!CheckForContractable(Operands.front())) + return InstructionCost::getInvalid(); + // Compare the costs. + InstructionCost FMulPlusFAddCost = 0; + InstructionCost FMACost = 0; + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + FastMathFlags FMF; + FMF.set(); + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + if (auto *FPCI = dyn_cast<FPMathOperator>(I)) + FMF &= FPCI->getFastMathFlags(); + FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind); + } + unsigned NumOps = 0; + for (auto [V, Op] : zip(VL, Operands.front())) { + auto *I = dyn_cast<Instruction>(Op); + if (!I || !I->hasOneUse()) { + FMACost += TTI.getInstructionCost(cast<Instruction>(V), CostKind); + if (I) + FMACost += TTI.getInstructionCost(I, CostKind); + continue; + } + ++NumOps; + if (auto *FPCI = dyn_cast<FPMathOperator>(I)) + FMF &= FPCI->getFastMathFlags(); + FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind); + } + Type *Ty = VL.front()->getType(); + IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF); + FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind); + return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid(); +} + void BoUpSLP::transformNodes() { constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; BaseGraphSize = VectorizableTree.size(); @@ -12355,6 +12434,25 @@ void BoUpSLP::transformNodes() { } break; } + case Instruction::FSub: + case Instruction::FAdd: { + // Check if possible to convert (a*b)+c to fma. + if (E.State != TreeEntry::Vectorize || + !E.getOperations().isAddSubLikeOp()) + break; + if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI) + .isValid()) + break; + // This node is a fmuladd node. + E.CombinedOp = TreeEntry::FMulAdd; + TreeEntry *FMulEntry = getOperandEntry(&E, 0); + if (FMulEntry->UserTreeIndex && + FMulEntry->State == TreeEntry::Vectorize) { + // The FMul node is part of the combined fmuladd node. + FMulEntry->State = TreeEntry::CombinedVectorize; + } + break; + } default: break; } @@ -13587,6 +13685,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } return IntrinsicCost; }; + auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S, + Instruction *VI) { + InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI); + return Cost; + }; switch (ShuffleOrOp) { case Instruction::PHI: { // Count reused scalars. 
@@ -13927,6 +14030,30 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, }; return GetCostDiff(GetScalarCost, GetVectorCost); } + case TreeEntry::FMulAdd: { + auto GetScalarCost = [&](unsigned Idx) { + if (isa<PoisonValue>(UniqueValues[Idx])) + return InstructionCost(TTI::TCC_Free); + return GetFMulAddCost(E->getOperations(), + cast<Instruction>(UniqueValues[Idx])); + }; + auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) { + FastMathFlags FMF; + FMF.set(); + for (Value *V : E->Scalars) { + if (auto *FPCI = dyn_cast<FPMathOperator>(V)) { + FMF &= FPCI->getFastMathFlags(); + if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0))) + FMF &= FPCIOp->getFastMathFlags(); + } + } + IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy, + {VecTy, VecTy, VecTy}, FMF); + InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind); + return VecCost + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: @@ -13964,8 +14091,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1); TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2); - return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind, - Op1Info, Op2Info, Operands); + InstructionCost ScalarCost = TTI->getArithmeticInstrCost( + ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands); + if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]); + I && (ShuffleOrOp == Instruction::FAdd || + ShuffleOrOp == Instruction::FSub)) { + InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I); + if (IntrinsicCost.isValid()) + ScalarCost = IntrinsicCost; + } + return ScalarCost; }; auto GetVectorCost = [=](InstructionCost CommonCost) { if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) { @@ -22594,11 +22729,21 @@ public: /// Try to find a reduction tree. bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root, ScalarEvolution &SE, const DataLayout &DL, - const TargetLibraryInfo &TLI) { + const TargetLibraryInfo &TLI, + DominatorTree &DT, TargetTransformInfo &TTI) { RdxKind = HorizontalReduction::getRdxKind(Root); if (!isVectorizable(RdxKind, Root)) return false; + // FMA reduction root - skip. + auto CheckForFMA = [&](Instruction *I) { + return RdxKind == RecurKind::FAdd && + canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI) + .isValid(); + }; + if (CheckForFMA(Root)) + return false; + // Analyze "regular" integer/FP types for reductions - no target-specific // types or pointers. Type *Ty = Root->getType(); @@ -22636,7 +22781,7 @@ public: // Also, do not try to reduce const values, if the operation is not // foldable. 
if (!EdgeInst || Level > RecursionMaxDepth || - getRdxKind(EdgeInst) != RdxKind || + getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) || IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) || !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || !isVectorizable(RdxKind, EdgeInst) || @@ -24205,13 +24350,13 @@ bool SLPVectorizerPass::vectorizeHorReduction( Stack.emplace(SelectRoot(), 0); SmallPtrSet<Value *, 8> VisitedInstrs; bool Res = false; - auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * { + auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * { if (R.isAnalyzedReductionRoot(Inst)) return nullptr; if (!isReductionCandidate(Inst)) return nullptr; HorizontalReduction HorRdx; - if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) + if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI)) return nullptr; return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); }; @@ -24277,6 +24422,12 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType())) return false; + // Skip potential FMA candidates. + if ((I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) && + canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI) + .isValid()) + return false; Value *P = I->getParent(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 8052e31..73babcc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1054,12 +1054,17 @@ void VPlan::execute(VPTransformState *State) { InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { // For now only return the cost of the vector loop region, ignoring any other - // blocks, like the preheader or middle blocks. + // blocks, like the preheader or middle blocks, expect for checking them for + // recipes with invalid costs. InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx); - // If any instructions in the middle block are invalid return invalid. - // TODO: Remove once no VPlans with VF == vscale x 1 and first-order recurrences are created. - if (!getMiddleBlock()->cost(VF, Ctx).isValid()) + // If the cost of the loop region is invalid or any recipe in the skeleton + // outside loop regions are invalid return an invalid cost. + if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_shallow(getEntry())), + [&VF, &Ctx](VPBasicBlock *VPBB) { + return !VPBB->cost(VF, Ctx).isValid(); + })) return InstructionCost::getInvalid(); return Cost; diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll index 113eb14..4db9db9 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll @@ -370,3 +370,175 @@ entry: %r = select i1 %c, i64 %a, i64 %ands ret i64 %r } + +; Test EOR. +define i32 @test1_eor(i32 %a) { +; CHECK-LABEL: test1_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor w8, w0, #0x400 +; CHECK-NEXT: eor w0, w8, #0x200000 +; CHECK-NEXT: ret +entry: + %eor = xor i32 %a, 2098176 + ret i32 %eor +} + +; This constant should not be split because it can be handled by one mov. 
+define i32 @test2_eor(i32 %a) { +; CHECK-LABEL: test2_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #135 // =0x87 +; CHECK-NEXT: eor w0, w0, w8 +; CHECK-NEXT: ret +entry: + %eor = xor i32 %a, 135 + ret i32 %eor +} + +; This constant should not be split because the split immediate is not valid +; bitmask immediate. +define i32 @test3_eor(i32 %a) { +; CHECK-LABEL: test3_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1024 // =0x400 +; CHECK-NEXT: movk w8, #33, lsl #16 +; CHECK-NEXT: eor w0, w0, w8 +; CHECK-NEXT: ret +entry: + %eor = xor i32 %a, 2163712 + ret i32 %eor +} + +define i64 @test4_eor(i64 %a) { +; CHECK-LABEL: test4_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor x8, x0, #0x400 +; CHECK-NEXT: eor x0, x8, #0x200000 +; CHECK-NEXT: ret +entry: + %eor = xor i64 %a, 2098176 + ret i64 %eor +} + +define i64 @test5_eor(i64 %a) { +; CHECK-LABEL: test5_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor x8, x0, #0x4000 +; CHECK-NEXT: eor x0, x8, #0x200000000 +; CHECK-NEXT: ret +entry: + %eor = xor i64 %a, 8589950976 + ret i64 %eor +} + +; This constant should not be split because it can be handled by one mov. +define i64 @test6_eor(i64 %a) { +; CHECK-LABEL: test6_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #135 // =0x87 +; CHECK-NEXT: eor x0, x0, x8 +; CHECK-NEXT: ret +entry: + %eor = xor i64 %a, 135 + ret i64 %eor +} + +; This constant should not be split because the split immediate is not valid +; bitmask immediate. +define i64 @test7_eor(i64 %a) { +; CHECK-LABEL: test7_eor: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1024 // =0x400 +; CHECK-NEXT: movk w8, #33, lsl #16 +; CHECK-NEXT: eor x0, x0, x8 +; CHECK-NEXT: ret +entry: + %eor = xor i64 %a, 2163712 + ret i64 %eor +} + +; Test ORR. +define i32 @test1_orr(i32 %a) { +; CHECK-LABEL: test1_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr w8, w0, #0x400 +; CHECK-NEXT: orr w0, w8, #0x200000 +; CHECK-NEXT: ret +entry: + %orr = or i32 %a, 2098176 + ret i32 %orr +} + +; This constant should not be split because it can be handled by one mov. +define i32 @test2_orr(i32 %a) { +; CHECK-LABEL: test2_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #135 // =0x87 +; CHECK-NEXT: orr w0, w0, w8 +; CHECK-NEXT: ret +entry: + %orr = or i32 %a, 135 + ret i32 %orr +} + +; This constant should not be split because the split immediate is not valid +; bitmask immediate. +define i32 @test3_orr(i32 %a) { +; CHECK-LABEL: test3_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1024 // =0x400 +; CHECK-NEXT: movk w8, #33, lsl #16 +; CHECK-NEXT: orr w0, w0, w8 +; CHECK-NEXT: ret +entry: + %orr = or i32 %a, 2163712 + ret i32 %orr +} + +define i64 @test4_orr(i64 %a) { +; CHECK-LABEL: test4_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x8, x0, #0x400 +; CHECK-NEXT: orr x0, x8, #0x200000 +; CHECK-NEXT: ret +entry: + %orr = or i64 %a, 2098176 + ret i64 %orr +} + +define i64 @test5_orr(i64 %a) { +; CHECK-LABEL: test5_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x8, x0, #0x4000 +; CHECK-NEXT: orr x0, x8, #0x200000000 +; CHECK-NEXT: ret +entry: + %orr = or i64 %a, 8589950976 + ret i64 %orr +} + +; This constant should not be split because it can be handled by one mov. 
+define i64 @test6_orr(i64 %a) { +; CHECK-LABEL: test6_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #135 // =0x87 +; CHECK-NEXT: orr x0, x0, x8 +; CHECK-NEXT: ret +entry: + %orr = or i64 %a, 135 + ret i64 %orr +} + +; This constant should not be split because the split immediate is not valid +; bitmask immediate. +define i64 @test7_orr(i64 %a) { +; CHECK-LABEL: test7_orr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1024 // =0x400 +; CHECK-NEXT: movk w8, #33, lsl #16 +; CHECK-NEXT: orr x0, x0, x8 +; CHECK-NEXT: ret +entry: + %orr = or i64 %a, 2163712 + ret i64 %orr +} diff --git a/llvm/test/CodeGen/AVR/cmp.ll b/llvm/test/CodeGen/AVR/cmp.ll index efc9b8d..c932bda1 100644 --- a/llvm/test/CodeGen/AVR/cmp.ll +++ b/llvm/test/CodeGen/AVR/cmp.ll @@ -298,3 +298,18 @@ define i16 @cmp_i16_gt_1023(i16 %0) { %3 = zext i1 %2 to i16 ret i16 %3 } + +define void @cmp_issue152097(i16 %a) addrspace(1) { +; See: https://github.com/llvm/llvm-project/issues/152097 +; CHECK-LABEL: cmp_issue152097 +; CHECK: ldi r18, -1 +; CHECK-NEXT: cpi r24, -2 +; CHECK-NEXT: cpc r25, r18 +; CHECK-NEXT: ret + %cmp = icmp ugt i16 -2, %a + br i1 %cmp, label %if.then, label %if.else +if.then: + ret void +if.else: + ret void +} diff --git a/llvm/test/CodeGen/DirectX/imad.ll b/llvm/test/CodeGen/DirectX/imad.ll index 5d9463d..2e612f0 100644 --- a/llvm/test/CodeGen/DirectX/imad.ll +++ b/llvm/test/CodeGen/DirectX/imad.ll @@ -1,17 +1,13 @@ -; RUN: opt -S -dxil-op-lower < %s | FileCheck %s +; RUN: opt -S -scalarizer -dxil-op-lower < %s | FileCheck %s ; Make sure dxil operation function calls for round are generated for float and half. -; CHECK:call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR:]] -; CHECK:call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] -; CHECK:call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] - -; CHECK: attributes #[[#ATTR]] = {{{.*}} memory(none) {{.*}}} target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" target triple = "dxil-pc-shadermodel6.7-library" ; Function Attrs: noinline nounwind optnone define noundef i16 @imad_short(i16 noundef %p0, i16 noundef %p1, i16 noundef %p2) #0 { entry: + ; CHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR:]] %p2.addr = alloca i16, align 2 %p1.addr = alloca i16, align 2 %p0.addr = alloca i16, align 2 @@ -31,6 +27,7 @@ declare i16 @llvm.dx.imad.i16(i16, i16, i16) #1 ; Function Attrs: noinline nounwind optnone define noundef i32 @imad_int(i32 noundef %p0, i32 noundef %p1, i32 noundef %p2) #0 { entry: + ; CHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] %p2.addr = alloca i32, align 4 %p1.addr = alloca i32, align 4 %p0.addr = alloca i32, align 4 @@ -50,6 +47,7 @@ declare i32 @llvm.dx.imad.i32(i32, i32, i32) #1 ; Function Attrs: noinline nounwind optnone define noundef i64 @imad_int64(i64 noundef %p0, i64 noundef %p1, i64 noundef %p2) #0 { entry: + ; CHECK: call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] %p2.addr = alloca i64, align 8 %p1.addr = alloca i64, align 8 %p0.addr = alloca i64, align 8 @@ -65,3 +63,95 @@ entry: ; Function Attrs: nocallback nofree nosync nounwind willreturn declare i64 @llvm.dx.imad.i64(i64, i64, i64) #1 + +; Function Attrs: noinline nounwind optnone +define noundef <4 x i16> @imad_int16_t4(<4 x i16> noundef %p0, <4 x i16> noundef %p1, <4 x i16> noundef %p2) #0 
{ +entry: + ; CHECK: extractelement <4 x i16> %p0, i64 0 + ; CHECK: extractelement <4 x i16> %p1, i64 0 + ; CHECK: extractelement <4 x i16> %p2, i64 0 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i16> %p0, i64 1 + ; CHECK: extractelement <4 x i16> %p1, i64 1 + ; CHECK: extractelement <4 x i16> %p2, i64 1 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i16> %p0, i64 2 + ; CHECK: extractelement <4 x i16> %p1, i64 2 + ; CHECK: extractelement <4 x i16> %p2, i64 2 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i16> %p0, i64 3 + ; CHECK: extractelement <4 x i16> %p1, i64 3 + ; CHECK: extractelement <4 x i16> %p2, i64 3 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: insertelement <4 x i16> poison, i16 %{{.*}}, i64 0 + ; CHECK: insertelement <4 x i16> %{{.*}}, i16 %{{.*}}, i64 1 + ; CHECK: insertelement <4 x i16> %{{.*}}, i16 %{{.*}}, i64 2 + ; CHECK: insertelement <4 x i16> %{{.*}}, i16 %{{.*}}, i64 3 + %dx.imad = call <4 x i16> @llvm.dx.imad.v4i16(<4 x i16> %p0, <4 x i16> %p1, <4 x i16> %p2) + ret <4 x i16> %dx.imad +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare <4 x i16> @llvm.dx.imad.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) #1 + +; Function Attrs: noinline nounwind optnone +define noundef <4 x i32> @imad_int4(<4 x i32> noundef %p0, <4 x i32> noundef %p1, <4 x i32> noundef %p2) #0 { +entry: + ; CHECK: extractelement <4 x i32> %p0, i64 0 + ; CHECK: extractelement <4 x i32> %p1, i64 0 + ; CHECK: extractelement <4 x i32> %p2, i64 0 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i32> %p0, i64 1 + ; CHECK: extractelement <4 x i32> %p1, i64 1 + ; CHECK: extractelement <4 x i32> %p2, i64 1 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i32> %p0, i64 2 + ; CHECK: extractelement <4 x i32> %p1, i64 2 + ; CHECK: extractelement <4 x i32> %p2, i64 2 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i32> %p0, i64 3 + ; CHECK: extractelement <4 x i32> %p1, i64 3 + ; CHECK: extractelement <4 x i32> %p2, i64 3 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i64 0 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i64 1 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i64 2 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i64 3 + %dx.imad = call <4 x i32> @llvm.dx.imad.v4i32(<4 x i32> %p0, <4 x i32> %p1, <4 x i32> %p2) + ret <4 x i32> %dx.imad +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare <4 x i32> @llvm.dx.imad.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1 + +; Function Attrs: noinline nounwind optnone +define noundef <4 x i64> @imad_int64_t4(<4 x i64> noundef %p0, <4 x i64> noundef %p1, <4 x i64> noundef %p2) #0 { +entry: + ; CHECK: extractelement <4 x i64> %p0, i64 0 + ; CHECK: extractelement <4 x i64> %p1, i64 0 + ; CHECK: extractelement <4 x i64> %p2, i64 0 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x 
i64> %p0, i64 1 + ; CHECK: extractelement <4 x i64> %p1, i64 1 + ; CHECK: extractelement <4 x i64> %p2, i64 1 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i64> %p0, i64 2 + ; CHECK: extractelement <4 x i64> %p1, i64 2 + ; CHECK: extractelement <4 x i64> %p2, i64 2 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i64> %p0, i64 3 + ; CHECK: extractelement <4 x i64> %p1, i64 3 + ; CHECK: extractelement <4 x i64> %p2, i64 3 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i64 0 + ; CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i64 1 + ; CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i64 2 + ; CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i64 3 + %dx.imad = call <4 x i64> @llvm.dx.imad.v4i64(<4 x i64> %p0, <4 x i64> %p1, <4 x i64> %p2) + ret <4 x i64> %dx.imad +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare <4 x i64> @llvm.dx.imad.v4i64(<4 x i64>, <4 x i64>, <4 x i64>) #1 + +; CHECK: attributes #[[#ATTR]] = {{{.*}} memory(none) {{.*}}} diff --git a/llvm/test/CodeGen/DirectX/umad.ll b/llvm/test/CodeGen/DirectX/umad.ll index 104d238..76516a2 100644 --- a/llvm/test/CodeGen/DirectX/umad.ll +++ b/llvm/test/CodeGen/DirectX/umad.ll @@ -1,17 +1,13 @@ -; RUN: opt -S -dxil-op-lower < %s | FileCheck %s +; RUN: opt -S -scalarizer -dxil-op-lower < %s | FileCheck %s ; Make sure dxil operation function calls for round are generated for float and half. -; CHECK:call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR:]] -; CHECK:call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] -; CHECK:call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] - -; CHECK: attributes #[[#ATTR]] = {{{.*}} memory(none) {{.*}}} target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" target triple = "dxil-pc-shadermodel6.7-library" ; Function Attrs: noinline nounwind optnone define noundef i16 @umad_ushort(i16 noundef %p0, i16 noundef %p1, i16 noundef %p2) #0 { entry: + ; CHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR:]] %p2.addr = alloca i16, align 2 %p1.addr = alloca i16, align 2 %p0.addr = alloca i16, align 2 @@ -31,6 +27,7 @@ declare i16 @llvm.dx.umad.i16(i16, i16, i16) #1 ; Function Attrs: noinline nounwind optnone define noundef i32 @umad_uint(i32 noundef %p0, i32 noundef %p1, i32 noundef %p2) #0 { entry: + ; CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] %p2.addr = alloca i32, align 4 %p1.addr = alloca i32, align 4 %p0.addr = alloca i32, align 4 @@ -50,6 +47,7 @@ declare i32 @llvm.dx.umad.i32(i32, i32, i32) #1 ; Function Attrs: noinline nounwind optnone define noundef i64 @umad_uint64(i64 noundef %p0, i64 noundef %p1, i64 noundef %p2) #0 { entry: + ; CHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] %p2.addr = alloca i64, align 8 %p1.addr = alloca i64, align 8 %p0.addr = alloca i64, align 8 @@ -65,3 +63,95 @@ entry: ; Function Attrs: nocallback nofree nosync nounwind willreturn declare i64 @llvm.dx.umad.i64(i64, i64, i64) #1 + +; Function Attrs: noinline nounwind optnone +define noundef <4 x i16> @umad_uint16_t4(<4 x i16> noundef %p0, <4 x 
i16> noundef %p1, <4 x i16> noundef %p2) #0 { +entry: + ; CHECK: extractelement <4 x i16> %p0, i64 0 + ; CHECK: extractelement <4 x i16> %p1, i64 0 + ; CHECK: extractelement <4 x i16> %p2, i64 0 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i16> %p0, i64 1 + ; CHECK: extractelement <4 x i16> %p1, i64 1 + ; CHECK: extractelement <4 x i16> %p2, i64 1 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i16> %p0, i64 2 + ; CHECK: extractelement <4 x i16> %p1, i64 2 + ; CHECK: extractelement <4 x i16> %p2, i64 2 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i16> %p0, i64 3 + ; CHECK: extractelement <4 x i16> %p1, i64 3 + ; CHECK: extractelement <4 x i16> %p2, i64 3 + ; CHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) #[[#ATTR]] + ; CHECK: insertelement <4 x i16> poison, i16 %{{.*}}, i64 0 + ; CHECK: insertelement <4 x i16> %{{.*}}, i16 %{{.*}}, i64 1 + ; CHECK: insertelement <4 x i16> %{{.*}}, i16 %{{.*}}, i64 2 + ; CHECK: insertelement <4 x i16> %{{.*}}, i16 %{{.*}}, i64 3 + %dx.umad = call <4 x i16> @llvm.dx.umad.v4i16(<4 x i16> %p0, <4 x i16> %p1, <4 x i16> %p2) + ret <4 x i16> %dx.umad +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare <4 x i16> @llvm.dx.umad.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) #1 + +; Function Attrs: noinline nounwind optnone +define noundef <4 x i32> @umad_uint4(<4 x i32> noundef %p0, <4 x i32> noundef %p1, <4 x i32> noundef %p2) #0 { +entry: + ; CHECK: extractelement <4 x i32> %p0, i64 0 + ; CHECK: extractelement <4 x i32> %p1, i64 0 + ; CHECK: extractelement <4 x i32> %p2, i64 0 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i32> %p0, i64 1 + ; CHECK: extractelement <4 x i32> %p1, i64 1 + ; CHECK: extractelement <4 x i32> %p2, i64 1 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i32> %p0, i64 2 + ; CHECK: extractelement <4 x i32> %p1, i64 2 + ; CHECK: extractelement <4 x i32> %p2, i64 2 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i32> %p0, i64 3 + ; CHECK: extractelement <4 x i32> %p1, i64 3 + ; CHECK: extractelement <4 x i32> %p2, i64 3 + ; CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) #[[#ATTR]] + ; CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i64 0 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i64 1 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i64 2 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i64 3 + %dx.umad = call <4 x i32> @llvm.dx.umad.v4i32(<4 x i32> %p0, <4 x i32> %p1, <4 x i32> %p2) + ret <4 x i32> %dx.umad +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare <4 x i32> @llvm.dx.umad.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1 + +; Function Attrs: noinline nounwind optnone +define noundef <4 x i64> @umad_uint64_t4(<4 x i64> noundef %p0, <4 x i64> noundef %p1, <4 x i64> noundef %p2) #0 { +entry: + ; CHECK: extractelement <4 x i64> %p0, i64 0 + ; CHECK: extractelement <4 x i64> %p1, i64 0 + ; CHECK: extractelement <4 x i64> %p2, i64 0 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 
%{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i64> %p0, i64 1 + ; CHECK: extractelement <4 x i64> %p1, i64 1 + ; CHECK: extractelement <4 x i64> %p2, i64 1 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i64> %p0, i64 2 + ; CHECK: extractelement <4 x i64> %p1, i64 2 + ; CHECK: extractelement <4 x i64> %p2, i64 2 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: extractelement <4 x i64> %p0, i64 3 + ; CHECK: extractelement <4 x i64> %p1, i64 3 + ; CHECK: extractelement <4 x i64> %p2, i64 3 + ; CHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) #[[#ATTR]] + ; CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i64 0 + ; CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i64 1 + ; CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i64 2 + ; CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i64 3 + %dx.umad = call <4 x i64> @llvm.dx.umad.v4i64(<4 x i64> %p0, <4 x i64> %p1, <4 x i64> %p2) + ret <4 x i64> %dx.umad +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare <4 x i64> @llvm.dx.umad.v4i64(<4 x i64>, <4 x i64>, <4 x i64>) #1 + +; CHECK: attributes #[[#ATTR]] = {{{.*}} memory(none) {{.*}}} diff --git a/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll new file mode 100644 index 0000000..3efe9be --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll @@ -0,0 +1,80 @@ +; RUN: opt < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix=INFER
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s --check-prefix=PTX
+; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+@constant_tensormap = addrspace(4) global [64 x i8] zeroinitializer, align 64
+
+; Inference from const address space
+define void @test_infer_const_from_cast() {
+; INFER-LABEL: @test_infer_const_from_cast
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
+; PTX-LABEL: .visible .func test_infer_const_from_cast(
+; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
+; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+ %casted = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %casted)
+ ret void
+}
+
+; Cast from Const space to Generic
+define void @test_const_to_generic_cast(ptr addrspace(4) %const_ptr) {
+; INFER-LABEL: @test_const_to_generic_cast
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+; PTX-LABEL: .visible .func test_const_to_generic_cast(
+; PTX: prefetch.const.tensormap [%rd{{[0-9]+}}];
+entry:
+ %cast = addrspacecast ptr addrspace(4) %const_ptr to ptr
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
+ ret void
+}
+
+; No inference possible
+define void @test_no_inference_possible(ptr %generic_ptr) {
+; INFER-LABEL: @test_no_inference_possible
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p0(ptr %generic_ptr)
+; PTX-LABEL: .visible .func test_no_inference_possible(
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %generic_ptr)
+ ret void
+}
+
+; Cast from Parameter space to Generic
+define void @test_param_to_generic_cast(ptr addrspace(101) %param_ptr) {
+; INFER-LABEL: @test_param_to_generic_cast
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+; PTX-LABEL: .visible .func test_param_to_generic_cast(
+; PTX: prefetch.param.tensormap [%rd{{[0-9]+}}];
+entry:
+ %cast = addrspacecast ptr addrspace(101) %param_ptr to ptr
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
+ ret void
+}
+
+; Multiple casts in sequence
+define void @test_infer_through_multiple_casts() {
+; INFER-LABEL: @test_infer_through_multiple_casts
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
+; PTX-LABEL: .visible .func test_infer_through_multiple_casts(
+; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
+; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+ %cast1 = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
+ %cast2 = addrspacecast ptr %cast1 to ptr addrspace(4)
+ %cast3 = addrspacecast ptr addrspace(4) %cast2 to ptr
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast3)
+ ret void
+}
+
+declare void @llvm.nvvm.prefetch.tensormap.p0(ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4))
+declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101))
diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll index a64e4fe..862e26d 100644 --- a/llvm/test/CodeGen/NVPTX/prefetch.ll +++ b/llvm/test/CodeGen/NVPTX/prefetch.ll @@ -12,6 +12,10 @@ declare void @llvm.nvvm.prefetch.local.L2(ptr addrspace(5) %local_ptr) declare void @llvm.nvvm.prefetch.L1(ptr %ptr)
declare void @llvm.nvvm.prefetch.L2(ptr %ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+
declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr)
declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr)
@@ -78,4 +82,43 @@ define void @prefetchu_l1(ptr %ptr) { ; CHECK-PTX64-NEXT: ret;
tail call void @llvm.nvvm.prefetchu.L1(ptr %ptr)
ret void
+}
+
+define void @prefetch_tensormap(ptr %ptr) {
+; CHECK-PTX64-LABEL: prefetch_tensormap(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_tensormap_param_0];
+; CHECK-PTX64-NEXT: prefetch.tensormap [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+ ret void
+}
+
+define void @prefetch_const_tensormap(ptr addrspace(4) %const_ptr) {
+; CHECK-PTX64-LABEL: prefetch_const_tensormap(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_const_tensormap_param_0];
+; CHECK-PTX64-NEXT: prefetch.const.tensormap [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+ ret void
+}
+
+define void @prefetch_param_tensormap(ptr addrspace(101) %param_ptr) {
+; CHECK-PTX64-LABEL: prefetch_param_tensormap(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_param_tensormap_param_0];
+; CHECK-PTX64-NEXT: prefetch.param.tensormap [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+ ret void
}
\ No newline at end of file diff --git a/llvm/test/Instrumentation/TypeSanitizer/alloca.ll b/llvm/test/Instrumentation/TypeSanitizer/alloca.ll index c53b006..fc72631 100644 --- a/llvm/test/Instrumentation/TypeSanitizer/alloca.ll +++ b/llvm/test/Instrumentation/TypeSanitizer/alloca.ll @@ -74,3 +74,56 @@ loop: exit: ret void } + +define void @dynamic_alloca_lifetime_test(i1 %c, i64 %n) sanitize_type { +; CHECK-LABEL: @dynamic_alloca_lifetime_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: [[X:%.*]] = alloca i32, i64 [[N:%.*]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[N]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[X]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[APP_MEM_MASK]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], [[SHADOW_BASE]] +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP0]], 3 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 0, i64 [[TMP6]], i1 false) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], [[APP_MEM_MASK]] +; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], [[SHADOW_BASE]] +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[TMP7]], 3 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP12]], i8 0, i64 [[TMP13]], i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr [[X]]) +; CHECK-NEXT: call void @alloca_test_use(ptr [[X]]) +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[N]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[X]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP15]], [[APP_MEM_MASK]] +; CHECK-NEXT: [[TMP17:%.*]] = shl i64 [[TMP16]], 3 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], [[SHADOW_BASE]] +; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr +; CHECK-NEXT: [[TMP20:%.*]] = shl i64 [[TMP14]], 3 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP19]], i8 0, i64 [[TMP20]], i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[X]]) +; CHECK-NEXT: br i1 [[C:%.*]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %x = alloca i32, i64 %n, align 1 + br label %loop + +loop: + call void @llvm.lifetime.start.p0(i64 -1, ptr %x) + call void @alloca_test_use(ptr %x) + call void @llvm.lifetime.end.p0(i64 -1, ptr %x) + br i1 %c, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/MC/ELF/many-instructions.s b/llvm/test/MC/ELF/many-instructions.s index 843d35f..7c13c0d 100644 --- a/llvm/test/MC/ELF/many-instructions.s +++ b/llvm/test/MC/ELF/many-instructions.s @@ -1,4 +1,5 @@ -# REQUIRES: asserts +## Checks the size of an internal MC structure that is different on 32-bit. +# REQUIRES: asserts, llvm-64-bits # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o /dev/null -debug-only=mc-dump 2>&1 | grep -E -o '[0-9]+ Data Size:[0-9]+' | FileCheck %s ## Test that encodeInstruction may cause a new fragment to be created. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll index 8495dee..b4df63d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll @@ -1,47 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph" --version 5 -; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -mtriple=aarch64 -mattr=+sve -S \ -; RUN: -debug-only=loop-vectorize %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -mtriple=aarch64 -mattr=+sve -S %s | FileCheck %s -; FIXME: Hoisted vector code should be costed with scalable cost. -; In this example, `<vscale x 4 x float> @llvm.minimumnum` has an invalid cost, -; and hence should not be produced by LoopVectorize. - -; CHECK: LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %res = tail call float @llvm.minimumnum.f32(float %arg, float 0.000000e+00) define void @cost_hoisted_vector_code(ptr %p, float %arg) { ; CHECK-LABEL: define void @cost_hoisted_vector_code( ; CHECK-SAME: ptr [[P:%.*]], float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 -1, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 -1, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[ARG]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = add i64 1, [[N_VEC]] -; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> [[BROADCAST_SPLAT]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[ARG]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> zeroinitializer) ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = add i64 1, [[INDEX1]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[TMP10]] -; CHECK-NEXT: store <vscale x 4 x float> [[TMP7]], ptr [[TMP8]], align 4 -; CHECK-NEXT: store <vscale x 4 x float> [[TMP7]], ptr [[TMP11]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[TMP8]], i32 4 +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[TMP8]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], -8 +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 -1, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll index 20bc0af..76a7536 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a510 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA510 ; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a520 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA520 +; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a320 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA320 define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA510-LABEL: define void @sve_add( @@ -131,6 +132,70 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA520: [[FOR_COND_CLEANUP]]: ; CHECK-CA520-NEXT: ret void ; +; CHECK-CA320-LABEL: define void @sve_add( +; CHECK-CA320-SAME: ptr [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-CA320-NEXT: [[ENTRY:.*:]] +; CHECK-CA320-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-CA320-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-CA320-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-CA320-NEXT: [[CMP9_NOT:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-CA320-NEXT: br i1 [[CMP9_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK-CA320: [[FOR_BODY_PREHEADER]]: +; CHECK-CA320-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-CA320-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK-CA320: [[VECTOR_MEMCHECK]]: +; CHECK-CA320-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[A2]] +; CHECK-CA320-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; CHECK-CA320-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[B3]] +; CHECK-CA320-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32 +; CHECK-CA320-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-CA320-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK-CA320: [[VECTOR_PH]]: +; CHECK-CA320-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-CA320-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-CA320-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-CA320: [[VECTOR_BODY]]: +; CHECK-CA320-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-CA320-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 
[[INDEX]] +; CHECK-CA320-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 4 +; CHECK-CA320-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-CA320-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 +; CHECK-CA320-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]] +; CHECK-CA320-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i32 4 +; CHECK-CA320-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-CA320-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-CA320-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD]] +; CHECK-CA320-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD5]] +; CHECK-CA320-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]] +; CHECK-CA320-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 4 +; CHECK-CA320-NEXT: store <4 x float> [[TMP6]], ptr [[TMP8]], align 4 +; CHECK-CA320-NEXT: store <4 x float> [[TMP7]], ptr [[TMP9]], align 4 +; CHECK-CA320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-CA320-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-CA320-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-CA320: [[MIDDLE_BLOCK]]: +; CHECK-CA320-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-CA320-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-CA320: [[SCALAR_PH]]: +; CHECK-CA320-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-CA320-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-CA320: [[FOR_BODY]]: +; CHECK-CA320-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-CA320-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-CA320-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-CA320-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-CA320-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-CA320-NEXT: [[ADD:%.*]] = fadd fast float [[TMP12]], [[TMP11]] +; CHECK-CA320-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-CA320-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4 +; CHECK-CA320-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-CA320-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-CA320-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-CA320: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; CHECK-CA320-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK-CA320: [[FOR_COND_CLEANUP]]: +; CHECK-CA320-NEXT: ret void +; entry: %cmp9.not = icmp eq i64 %n, 0 br i1 %cmp9.not, label %for.cond.cleanup, label %for.body @@ -160,3 +225,8 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo ; CHECK-CA520: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK-CA520: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} ;. 
+; CHECK-CA320: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-CA320: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-CA320: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-CA320: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll index ce7b78e..2b01018 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll @@ -1,81 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph\:" --version 5 ; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" define void @trip7_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { -; CHECK-LABEL: @trip7_i64( -; CHECK: = call i64 @llvm.vscale.i64() -; CHECK-NEXT: = mul nuw i64 -; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[VF:%.*]] = mul nuw i64 [[VSCALE]], 2 -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] -; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) -; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) -; CHECK: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> {{%.*}}, ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] +; CHECK-LABEL: define void @trip7_i64( +; CHECK-SAME: ptr noalias noundef captures(none) [[DST:%.*]], ptr noalias noundef readonly captures(none) [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 7, [[TMP2]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 7) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; CHECK-NEXT: [[TMP6:%.*]] = shl nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], splat (i64 1) +; 
CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD1]], [[TMP6]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 7) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NOT:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) -; CHECK-NEXT: [[COND:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NOT]], i32 0 -; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body +; CHECK-NEXT: [[TMP9:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 2 x i1> [[TMP9]], i32 0 +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: ; entry: - br label %for.body + br label %loop -for.body: ; preds = %entry, %for.body - %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds i64, ptr %src, i64 %i.06 - %0 = load i64, ptr %arrayidx, align 8 +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv + %0 = load i64, ptr %gep.src, align 8 %mul = shl nsw i64 %0, 1 - %arrayidx1 = getelementptr inbounds i64, ptr %dst, i64 %i.06 - %1 = load i64, ptr %arrayidx1, align 8 + %gep.dst = getelementptr inbounds i64, ptr %dst, i64 %iv + %1 = load i64, ptr %gep.dst, align 8 %add = add nsw i64 %1, %mul - store i64 %add, ptr %arrayidx1, align 8 - %inc = add nuw nsw i64 %i.06, 1 - %exitcond.not = icmp eq i64 %inc, 7 - br i1 %exitcond.not, label %for.end, label %for.body + store i64 %add, ptr %gep.dst, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 7 + br i1 %ec, label %exit, label %loop -for.end: ; preds = %for.body +exit: ret void } define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { -; CHECK-LABEL: @trip5_i8( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-LABEL: define void @trip5_i8( +; CHECK-SAME: ptr noalias noundef captures(none) [[DST:%.*]], ptr noalias noundef readonly captures(none) [[SRC:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[GEP_SRC]], align 1 ; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP0]], 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[I_08]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[GEP_DST:%.*]] = 
getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[GEP_DST]], align 1 ; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]] -; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: for.end: +; CHECK-NEXT: store i8 [[ADD]], ptr [[GEP_DST]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 5 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; entry: - br label %for.body + br label %loop -for.body: ; preds = %entry, %for.body - %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 - %0 = load i8, ptr %arrayidx, align 1 +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds i8, ptr %src, i64 %iv + %0 = load i8, ptr %gep.src, align 1 %mul = shl i8 %0, 1 - %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 - %1 = load i8, ptr %arrayidx1, align 1 + %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv + %1 = load i8, ptr %gep.dst, align 1 %add = add i8 %mul, %1 - store i8 %add, ptr %arrayidx1, align 1 - %inc = add nuw nsw i64 %i.08, 1 - %exitcond.not = icmp eq i64 %inc, 5 - br i1 %exitcond.not, label %for.end, label %for.body + store i8 %add, ptr %gep.dst, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 5 + br i1 %ec, label %exit, label %loop -for.end: ; preds = %for.body +exit: ret void } diff --git a/llvm/test/Transforms/LoopVectorize/intrinsic.ll b/llvm/test/Transforms/LoopVectorize/intrinsic.ll index 9c910d7..10d83a4 100644 --- a/llvm/test/Transforms/LoopVectorize/intrinsic.ll +++ b/llvm/test/Transforms/LoopVectorize/intrinsic.ll @@ -324,6 +324,56 @@ for.end: ; preds = %for.body, %entry declare double @llvm.exp2.f64(double) +define void @ldexp_f32i32(i32 %n, ptr %y, ptr %x, i32 %exp) { +; CHECK-LABEL: @ldexp_f32i32( +; CHECK: llvm.ldexp.v4f32.v4i32 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call float @llvm.ldexp.f32.i32(float %0, i32 %exp) + %arrayidx2 = getelementptr inbounds float, ptr %x, i32 %iv + store float %call, ptr %arrayidx2, align 4 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare float @llvm.ldexp.f32.i32(float, i32) + +define void @ldexp_f64i32(i32 %n, ptr %y, ptr %x, i32 %exp) { +; CHECK-LABEL: @ldexp_f64i32( +; CHECK: llvm.ldexp.v4f64.v4i32 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv + %0 = load double, ptr %arrayidx, align 8 + %call = tail call double @llvm.ldexp.f64.i32(double %0, i32 %exp) + %arrayidx2 = getelementptr inbounds double, ptr %x, i32 %iv + store double %call, ptr %arrayidx2, align 8 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} 
+ +declare double @llvm.ldexp.f64.i32(double, i32) + define void @log_f32(i32 %n, ptr %y, ptr %x) { ; CHECK-LABEL: @log_f32( ; CHECK: llvm.log.v4f32 @@ -976,6 +1026,157 @@ for.end: ; preds = %for.body, %entry declare double @llvm.roundeven.f64(double) + +define void @lround_i32f32(i32 %n, ptr %y, ptr %x) { +; CHECK-LABEL: @lround_i32f32( +; CHECK: llvm.lround.v4i32.v4f32 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call i32 @llvm.lround.i32.f32(float %0) + %arrayidx2 = getelementptr inbounds i32, ptr %x, i32 %iv + store i32 %call, ptr %arrayidx2, align 4 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i32 @llvm.lround.i32.f32(float) + +define void @lround_i32f64(i32 %n, ptr %y, ptr %x) { +; CHECK-LABEL: @lround_i32f64( +; CHECK: llvm.lround.v4i32.v4f64 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv + %0 = load double, ptr %arrayidx, align 8 + %call = tail call i32 @llvm.lround.i32.f64(double %0) + %arrayidx2 = getelementptr inbounds i32, ptr %x, i32 %iv + store i32 %call, ptr %arrayidx2, align 8 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i32 @llvm.lround.i32.f64(double) + +define void @lround_i64f32(i32 %n, ptr %y, ptr %x) { +; CHECK-LABEL: @lround_i64f32( +; CHECK: llvm.lround.v4i64.v4f32 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call i64 @llvm.lround.i64.f32(float %0) + %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv + store i64 %call, ptr %arrayidx2, align 4 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i64 @llvm.lround.i64.f32(float) + +define void @lround_i64f64(i32 %n, ptr %y, ptr %x) { +; CHECK-LABEL: @lround_i64f64( +; CHECK: llvm.lround.v4i64.v4f64 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv + %0 = load double, ptr %arrayidx, align 8 + %call = tail call i64 @llvm.lround.i64.f64(double %0) + %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv + store i64 %call, ptr %arrayidx2, align 8 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i64 @llvm.lround.i64.f64(double) + +define void @llround_i64f32(i32 %n, ptr %y, ptr %x) { +; CHECK-LABEL: @llround_i64f32( +; CHECK: llvm.llround.v4i64.v4f32 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr 
%y, i32 %iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call i64 @llvm.llround.i64.f32(float %0) + %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv + store i64 %call, ptr %arrayidx2, align 4 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i64 @llvm.llround.i64.f32(float) + +define void @llround_i64f64(i32 %n, ptr %y, ptr %x) { +; CHECK-LABEL: @llround_i64f64( +; CHECK: llvm.llround.v4i64.v4f64 +; CHECK: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv + %0 = load double, ptr %arrayidx, align 8 + %call = tail call i64 @llvm.llround.i64.f64(double %0) + %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv + store i64 %call, ptr %arrayidx2, align 8 + %iv.next = add i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i64 @llvm.llround.i64.f64(double) + define void @fma_f32(i32 %n, ptr %y, ptr %x, ptr %z, ptr %w) { ; CHECK-LABEL: @fma_f32( ; CHECK: llvm.fma.v4f32 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll index 4427699..9e086dca 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll @@ -8,15 +8,18 @@ target triple = "aarch64--linux-gnu" define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] ; CHECK: for.body3.lr.ph: -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]] -; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]]) +; CHECK-NEXT: [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float +; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]] +; CHECK-NEXT: [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]] +; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL11]], [[MUL12]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] ; CHECK: for.end27: @@ -47,15 +50,18 @@ for.end27: define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 
[[YMIN:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] ; CHECK: for.body3.lr.ph: -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]] -; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]]) +; CHECK-NEXT: [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float +; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]] +; CHECK-NEXT: [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]] +; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[MUL11]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] ; CHECK: for.end27: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/exp.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/exp.ll new file mode 100644 index 0000000..301e5da --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/exp.ll @@ -0,0 +1,279 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64 < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +define void @ldexp_f32i32(ptr %x, ptr %y, i32 %exp) { +; CHECK-LABEL: @ldexp_f32i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call float @llvm.ldexp.f32.i32(float [[L0]], i32 [[EXP:%.*]]) +; CHECK-NEXT: [[L3:%.*]] = tail call float @llvm.ldexp.f32.i32(float [[L2]], i32 [[EXP]]) +; CHECK-NEXT: [[L5:%.*]] = tail call float @llvm.ldexp.f32.i32(float [[L4]], i32 [[EXP]]) +; CHECK-NEXT: [[L7:%.*]] = tail call float @llvm.ldexp.f32.i32(float [[L6]], i32 [[EXP]]) +; CHECK-NEXT: store float [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 1 +; CHECK-NEXT: store float [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 2 +; CHECK-NEXT: store float [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 3 +; CHECK-NEXT: store float [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load float, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1 + %l2 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr 
%x, i64 2 + %l4 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3 + %l6 = load float, ptr %arrayidx.3, align 4 + %l1 = tail call float @llvm.ldexp.f32.i32(float %l0, i32 %exp) + %l3 = tail call float @llvm.ldexp.f32.i32(float %l2, i32 %exp) + %l5 = tail call float @llvm.ldexp.f32.i32(float %l4, i32 %exp) + %l7 = tail call float @llvm.ldexp.f32.i32(float %l6, i32 %exp) + store float %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds float, ptr %y, i64 1 + store float %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds float, ptr %y, i64 2 + store float %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds float, ptr %y, i64 3 + store float %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @ldexp_f64i32(ptr %x, ptr %y, i32 %exp) { +; CHECK-LABEL: @ldexp_f64i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load double, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call double @llvm.ldexp.f64.i32(double [[L0]], i32 [[EXP:%.*]]) +; CHECK-NEXT: [[L3:%.*]] = tail call double @llvm.ldexp.f64.i32(double [[L2]], i32 [[EXP]]) +; CHECK-NEXT: [[L5:%.*]] = tail call double @llvm.ldexp.f64.i32(double [[L4]], i32 [[EXP]]) +; CHECK-NEXT: [[L7:%.*]] = tail call double @llvm.ldexp.f64.i32(double [[L6]], i32 [[EXP]]) +; CHECK-NEXT: store double [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 1 +; CHECK-NEXT: store double [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 2 +; CHECK-NEXT: store double [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 3 +; CHECK-NEXT: store double [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load double, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds double, ptr %x, i64 1 + %l2 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %x, i64 2 + %l4 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %x, i64 3 + %l6 = load double, ptr %arrayidx.3, align 4 + %l1 = tail call double @llvm.ldexp.f64.i32(double %l0, i32 %exp) + %l3 = tail call double @llvm.ldexp.f64.i32(double %l2, i32 %exp) + %l5 = tail call double @llvm.ldexp.f64.i32(double %l4, i32 %exp) + %l7 = tail call double @llvm.ldexp.f64.i32(double %l6, i32 %exp) + store double %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds double, ptr %y, i64 1 + store double %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds double, ptr %y, i64 2 + store double %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds double, ptr %y, i64 3 + store double %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @ldexp_f32i64(ptr %x, ptr %y, i64 %exp) { +; CHECK-LABEL: @ldexp_f32i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = 
getelementptr inbounds float, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call float @llvm.ldexp.f32.i64(float [[L0]], i64 [[EXP:%.*]]) +; CHECK-NEXT: [[L3:%.*]] = tail call float @llvm.ldexp.f32.i64(float [[L2]], i64 [[EXP]]) +; CHECK-NEXT: [[L5:%.*]] = tail call float @llvm.ldexp.f32.i64(float [[L4]], i64 [[EXP]]) +; CHECK-NEXT: [[L7:%.*]] = tail call float @llvm.ldexp.f32.i64(float [[L6]], i64 [[EXP]]) +; CHECK-NEXT: store float [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 1 +; CHECK-NEXT: store float [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 2 +; CHECK-NEXT: store float [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 3 +; CHECK-NEXT: store float [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load float, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1 + %l2 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2 + %l4 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3 + %l6 = load float, ptr %arrayidx.3, align 4 + %l1 = tail call float @llvm.ldexp.f32.i64(float %l0, i64 %exp) + %l3 = tail call float @llvm.ldexp.f32.i64(float %l2, i64 %exp) + %l5 = tail call float @llvm.ldexp.f32.i64(float %l4, i64 %exp) + %l7 = tail call float @llvm.ldexp.f32.i64(float %l6, i64 %exp) + store float %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds float, ptr %y, i64 1 + store float %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds float, ptr %y, i64 2 + store float %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds float, ptr %y, i64 3 + store float %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @ldexp_f64i64(ptr %x, ptr %y, i64 %exp) { +; CHECK-LABEL: @ldexp_f64i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load double, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call double @llvm.ldexp.f64.i64(double [[L0]], i64 [[EXP:%.*]]) +; CHECK-NEXT: [[L3:%.*]] = tail call double @llvm.ldexp.f64.i64(double [[L2]], i64 [[EXP]]) +; CHECK-NEXT: [[L5:%.*]] = tail call double @llvm.ldexp.f64.i64(double [[L4]], i64 [[EXP]]) +; CHECK-NEXT: [[L7:%.*]] = tail call double @llvm.ldexp.f64.i64(double [[L6]], i64 [[EXP]]) +; CHECK-NEXT: store double [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 1 +; CHECK-NEXT: store double [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = 
getelementptr inbounds double, ptr [[Y]], i64 2 +; CHECK-NEXT: store double [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 3 +; CHECK-NEXT: store double [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load double, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds double, ptr %x, i64 1 + %l2 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %x, i64 2 + %l4 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %x, i64 3 + %l6 = load double, ptr %arrayidx.3, align 4 + %l1 = tail call double @llvm.ldexp.f64.i64(double %l0, i64 %exp) + %l3 = tail call double @llvm.ldexp.f64.i64(double %l2, i64 %exp) + %l5 = tail call double @llvm.ldexp.f64.i64(double %l4, i64 %exp) + %l7 = tail call double @llvm.ldexp.f64.i64(double %l6, i64 %exp) + store double %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds double, ptr %y, i64 1 + store double %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds double, ptr %y, i64 2 + store double %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds double, ptr %y, i64 3 + store double %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @ldexp_f32i32_i64(ptr %x, ptr %y, i32 %exp32, i64 %exp64) { +; CHECK-LABEL: @ldexp_f32i32_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call float @llvm.ldexp.f32.i32(float [[L0]], i32 [[EXP32:%.*]]) +; CHECK-NEXT: [[L3:%.*]] = tail call float @llvm.ldexp.f32.i32(float [[L2]], i32 [[EXP32]]) +; CHECK-NEXT: [[L5:%.*]] = tail call float @llvm.ldexp.f32.i64(float [[L4]], i64 [[EXP64:%.*]]) +; CHECK-NEXT: [[L7:%.*]] = tail call float @llvm.ldexp.f32.i64(float [[L6]], i64 [[EXP64]]) +; CHECK-NEXT: store float [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 1 +; CHECK-NEXT: store float [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 2 +; CHECK-NEXT: store float [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 3 +; CHECK-NEXT: store float [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load float, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1 + %l2 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2 + %l4 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3 + %l6 = load float, ptr %arrayidx.3, align 4 + %l1 = tail call float @llvm.ldexp.f32.i32(float %l0, i32 %exp32) + %l3 = tail call float @llvm.ldexp.f32.i32(float %l2, i32 %exp32) + %l5 = tail call float @llvm.ldexp.f32.i64(float %l4, i64 %exp64) + %l7 = tail call float @llvm.ldexp.f32.i64(float %l6, i64 %exp64) + store float %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr 
inbounds float, ptr %y, i64 1 + store float %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds float, ptr %y, i64 2 + store float %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds float, ptr %y, i64 3 + store float %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @ldexp_f64_i32_i64(ptr %x, ptr %y, i32 %exp32, i64 %exp64) { +; CHECK-LABEL: @ldexp_f64_i32_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load double, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call double @llvm.ldexp.f64.i32(double [[L0]], i32 [[EXP32:%.*]]) +; CHECK-NEXT: [[L3:%.*]] = tail call double @llvm.ldexp.f64.i32(double [[L2]], i32 [[EXP32]]) +; CHECK-NEXT: [[L5:%.*]] = tail call double @llvm.ldexp.f64.i64(double [[L4]], i64 [[EXP64:%.*]]) +; CHECK-NEXT: [[L7:%.*]] = tail call double @llvm.ldexp.f64.i64(double [[L6]], i64 [[EXP64]]) +; CHECK-NEXT: store double [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 1 +; CHECK-NEXT: store double [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 2 +; CHECK-NEXT: store double [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 3 +; CHECK-NEXT: store double [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load double, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds double, ptr %x, i64 1 + %l2 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %x, i64 2 + %l4 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %x, i64 3 + %l6 = load double, ptr %arrayidx.3, align 4 + %l1 = tail call double @llvm.ldexp.f64.i32(double %l0, i32 %exp32) + %l3 = tail call double @llvm.ldexp.f64.i32(double %l2, i32 %exp32) + %l5 = tail call double @llvm.ldexp.f64.i64(double %l4, i64 %exp64) + %l7 = tail call double @llvm.ldexp.f64.i64(double %l6, i64 %exp64) + store double %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds double, ptr %y, i64 1 + store double %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds double, ptr %y, i64 2 + store double %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds double, ptr %y, i64 3 + store double %l7, ptr %arrayidx2.3, align 4 + ret void +} + +declare float @llvm.ldexp.f32.i32(float, i32) +declare double @llvm.ldexp.f64.i32(double, i32) +declare float @llvm.ldexp.f32.i64(float, i64) +declare double @llvm.ldexp.f64.i64(double, i64) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/fround.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/fround.ll new file mode 100644 index 0000000..07a3fe7 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/fround.ll @@ -0,0 +1,280 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64 < %s | FileCheck %s + +target datalayout = 
"e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +define void @lround_i32f32(ptr %x, ptr %y, i32 %n) { +; CHECK-LABEL: @lround_i32f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call i32 @llvm.lround.i32.f32(float [[L0]]) +; CHECK-NEXT: [[L3:%.*]] = tail call i32 @llvm.lround.i32.f32(float [[L2]]) +; CHECK-NEXT: [[L5:%.*]] = tail call i32 @llvm.lround.i32.f32(float [[L4]]) +; CHECK-NEXT: [[L7:%.*]] = tail call i32 @llvm.lround.i32.f32(float [[L6]]) +; CHECK-NEXT: store i32 [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 1 +; CHECK-NEXT: store i32 [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 2 +; CHECK-NEXT: store i32 [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 3 +; CHECK-NEXT: store i32 [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load float, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1 + %l2 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2 + %l4 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3 + %l6 = load float, ptr %arrayidx.3, align 4 + %l1 = tail call i32 @llvm.lround.i32.f32(float %l0) + %l3 = tail call i32 @llvm.lround.i32.f32(float %l2) + %l5 = tail call i32 @llvm.lround.i32.f32(float %l4) + %l7 = tail call i32 @llvm.lround.i32.f32(float %l6) + store i32 %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds i32, ptr %y, i64 1 + store i32 %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds i32, ptr %y, i64 2 + store i32 %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds i32, ptr %y, i64 3 + store i32 %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @lround_i32f64(ptr %x, ptr %y, i32 %n) { +; CHECK-LABEL: @lround_i32f64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load double, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call i32 @llvm.lround.i32.f64(double [[L0]]) +; CHECK-NEXT: [[L3:%.*]] = tail call i32 @llvm.lround.i32.f64(double [[L2]]) +; CHECK-NEXT: [[L5:%.*]] = tail call i32 @llvm.lround.i32.f64(double [[L4]]) +; CHECK-NEXT: [[L7:%.*]] = tail call i32 @llvm.lround.i32.f64(double [[L6]]) +; CHECK-NEXT: store i32 [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 1 +; CHECK-NEXT: store i32 [[L3]], ptr 
[[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 2 +; CHECK-NEXT: store i32 [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 3 +; CHECK-NEXT: store i32 [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load double, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds double, ptr %x, i64 1 + %l2 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %x, i64 2 + %l4 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %x, i64 3 + %l6 = load double, ptr %arrayidx.3, align 4 + %l1 = tail call i32 @llvm.lround.i32.f64(double %l0) + %l3 = tail call i32 @llvm.lround.i32.f64(double %l2) + %l5 = tail call i32 @llvm.lround.i32.f64(double %l4) + %l7 = tail call i32 @llvm.lround.i32.f64(double %l6) + store i32 %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds i32, ptr %y, i64 1 + store i32 %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds i32, ptr %y, i64 2 + store i32 %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds i32, ptr %y, i64 3 + store i32 %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @lround_i64f32(ptr %x, ptr %y, i64 %n) { +; CHECK-LABEL: @lround_i64f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call i64 @llvm.lround.i64.f32(float [[L0]]) +; CHECK-NEXT: [[L3:%.*]] = tail call i64 @llvm.lround.i64.f32(float [[L2]]) +; CHECK-NEXT: [[L5:%.*]] = tail call i64 @llvm.lround.i64.f32(float [[L4]]) +; CHECK-NEXT: [[L7:%.*]] = tail call i64 @llvm.lround.i64.f32(float [[L6]]) +; CHECK-NEXT: store i64 [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 1 +; CHECK-NEXT: store i64 [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 2 +; CHECK-NEXT: store i64 [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 3 +; CHECK-NEXT: store i64 [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load float, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1 + %l2 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2 + %l4 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3 + %l6 = load float, ptr %arrayidx.3, align 4 + %l1 = tail call i64 @llvm.lround.i64.f32(float %l0) + %l3 = tail call i64 @llvm.lround.i64.f32(float %l2) + %l5 = tail call i64 @llvm.lround.i64.f32(float %l4) + %l7 = tail call i64 @llvm.lround.i64.f32(float %l6) + store i64 %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds i64, ptr %y, i64 1 + store i64 %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds i64, ptr %y, i64 2 + store i64 %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 
= getelementptr inbounds i64, ptr %y, i64 3 + store i64 %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @lround_i64f64(ptr %x, ptr %y, i64 %n) { +; CHECK-LABEL: @lround_i64f64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load double, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call i64 @llvm.lround.i64.f64(double [[L0]]) +; CHECK-NEXT: [[L3:%.*]] = tail call i64 @llvm.lround.i64.f64(double [[L2]]) +; CHECK-NEXT: [[L5:%.*]] = tail call i64 @llvm.lround.i64.f64(double [[L4]]) +; CHECK-NEXT: [[L7:%.*]] = tail call i64 @llvm.lround.i64.f64(double [[L6]]) +; CHECK-NEXT: store i64 [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 1 +; CHECK-NEXT: store i64 [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 2 +; CHECK-NEXT: store i64 [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 3 +; CHECK-NEXT: store i64 [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load double, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds double, ptr %x, i64 1 + %l2 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %x, i64 2 + %l4 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %x, i64 3 + %l6 = load double, ptr %arrayidx.3, align 4 + %l1 = tail call i64 @llvm.lround.i64.f64(double %l0) + %l3 = tail call i64 @llvm.lround.i64.f64(double %l2) + %l5 = tail call i64 @llvm.lround.i64.f64(double %l4) + %l7 = tail call i64 @llvm.lround.i64.f64(double %l6) + store i64 %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds i64, ptr %y, i64 1 + store i64 %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds i64, ptr %y, i64 2 + store i64 %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds i64, ptr %y, i64 3 + store i64 %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @llround_i64f32(ptr %x, ptr %y, i64 %n) { +; CHECK-LABEL: @llround_i64f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call i64 @llvm.llround.i64.f32(float [[L0]]) +; CHECK-NEXT: [[L3:%.*]] = tail call i64 @llvm.llround.i64.f32(float [[L2]]) +; CHECK-NEXT: [[L5:%.*]] = tail call i64 @llvm.llround.i64.f32(float [[L4]]) +; CHECK-NEXT: [[L7:%.*]] = tail call i64 @llvm.llround.i64.f32(float [[L6]]) +; CHECK-NEXT: store i64 [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr 
inbounds i64, ptr [[Y]], i64 1 +; CHECK-NEXT: store i64 [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 2 +; CHECK-NEXT: store i64 [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 3 +; CHECK-NEXT: store i64 [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load float, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds float, ptr %x, i64 1 + %l2 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %x, i64 2 + %l4 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %x, i64 3 + %l6 = load float, ptr %arrayidx.3, align 4 + %l1 = tail call i64 @llvm.llround.i64.f32(float %l0) + %l3 = tail call i64 @llvm.llround.i64.f32(float %l2) + %l5 = tail call i64 @llvm.llround.i64.f32(float %l4) + %l7 = tail call i64 @llvm.llround.i64.f32(float %l6) + store i64 %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds i64, ptr %y, i64 1 + store i64 %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = getelementptr inbounds i64, ptr %y, i64 2 + store i64 %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds i64, ptr %y, i64 3 + store i64 %l7, ptr %arrayidx2.3, align 4 + ret void +} + +define void @llround_i64f64(ptr %x, ptr %y, i64 %n) { +; CHECK-LABEL: @llround_i64f64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 1 +; CHECK-NEXT: [[L2:%.*]] = load double, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[L4:%.*]] = load double, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 3 +; CHECK-NEXT: [[L6:%.*]] = load double, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[L1:%.*]] = tail call i64 @llvm.llround.i64.f64(double [[L0]]) +; CHECK-NEXT: [[L3:%.*]] = tail call i64 @llvm.llround.i64.f64(double [[L2]]) +; CHECK-NEXT: [[L5:%.*]] = tail call i64 @llvm.llround.i64.f64(double [[L4]]) +; CHECK-NEXT: [[L7:%.*]] = tail call i64 @llvm.llround.i64.f64(double [[L6]]) +; CHECK-NEXT: store i64 [[L1]], ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 1 +; CHECK-NEXT: store i64 [[L3]], ptr [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 2 +; CHECK-NEXT: store i64 [[L5]], ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i64, ptr [[Y]], i64 3 +; CHECK-NEXT: store i64 [[L7]], ptr [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %l0 = load double, ptr %x, align 4 + %arrayidx.1 = getelementptr inbounds double, ptr %x, i64 1 + %l2 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %x, i64 2 + %l4 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %x, i64 3 + %l6 = load double, ptr %arrayidx.3, align 4 + %l1 = tail call i64 @llvm.llround.i64.f64(double %l0) + %l3 = tail call i64 @llvm.llround.i64.f64(double %l2) + %l5 = tail call i64 @llvm.llround.i64.f64(double %l4) + %l7 = tail call i64 @llvm.llround.i64.f64(double %l6) + store i64 %l1, ptr %y, align 4 + %arrayidx2.1 = getelementptr inbounds i64, ptr %y, i64 1 + store i64 %l3, ptr %arrayidx2.1, align 4 + %arrayidx2.2 = 
getelementptr inbounds i64, ptr %y, i64 2 + store i64 %l5, ptr %arrayidx2.2, align 4 + %arrayidx2.3 = getelementptr inbounds i64, ptr %y, i64 3 + store i64 %l7, ptr %arrayidx2.3, align 4 + ret void +} + +declare i32 @llvm.lround.i32.f32(float) +declare i64 @llvm.lround.i64.f32(float) +declare i64 @llvm.lround.i64.f64(double) +declare i64 @llvm.llround.i64.f32(float) +declare i64 @llvm.llround.i64.f64(double) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll index 295a718..2e68432 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll @@ -12,7 +12,8 @@ define void @test() { ; CHECK: [[BB63]]: ; CHECK-NEXT: br label %[[BB64]] ; CHECK: [[BB64]]: -; CHECK-NEXT: [[TMP25:%.*]] = phi <16 x float> [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] +; CHECK-NEXT: [[I65:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] +; CHECK-NEXT: [[I77:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] ; CHECK-NEXT: [[I66:%.*]] = load float, ptr poison, align 16 ; CHECK-NEXT: [[I67:%.*]] = load float, ptr poison, align 4 ; CHECK-NEXT: [[I68:%.*]] = load float, ptr poison, align 8 @@ -24,57 +25,125 @@ define void @test() { ; CHECK-NEXT: [[I74:%.*]] = load float, ptr poison, align 4 ; CHECK-NEXT: [[I75:%.*]] = load float, ptr poison, align 16 ; CHECK-NEXT: [[I76:%.*]] = load float, ptr poison, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x float> poison, float [[I76]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x float> [[TMP1]], float [[I75]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x float> [[TMP2]], float [[I74]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x float> [[TMP3]], float [[I73]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x float> [[TMP4]], float [[I71]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x float> [[TMP5]], float [[I70]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x float> [[TMP6]], float [[I68]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x float> [[TMP7]], float [[I66]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[I72]], i32 13 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[I67]], i32 14 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[I69]], i32 15 ; CHECK-NEXT: br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]] ; CHECK: [[BB77]]: -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP17]], <8 x i32> <i32 8, i32 poison, i32 poison, i32 poison, i32 4, i32 5, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP14]], float [[I68]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP19]], float [[I66]], i32 3 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP16]], float [[I67]], i32 6 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x 
float> [[TMP20]], float [[I69]], i32 7 -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP25]], <16 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 poison, i32 poison> ; CHECK-NEXT: br label %[[BB78:.*]] ; CHECK: [[BB78]]: -; CHECK-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP23]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP22:%.*]] = phi <8 x float> [ [[TMP21]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5> -; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP13]] -; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP38]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]] -; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], poison -; CHECK-NEXT: [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison -; CHECK-NEXT: [[TMP36]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 5, i32 11, i32 12, i32 10, i32 14, i32 15, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP31]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 12, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 14, i32 15> +; CHECK-NEXT: [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[I103:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[I104:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[I105:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I106:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I123:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[I124:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I125:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[I126:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I87:%.*]] = fmul fast float [[I85]], poison +; CHECK-NEXT: [[I88:%.*]] = fmul fast float [[I80]], poison +; CHECK-NEXT: [[I89:%.*]] = fmul fast float [[I81]], poison +; CHECK-NEXT: [[I90:%.*]] = fmul fast float [[I82]], poison +; CHECK-NEXT: [[I91:%.*]] = fmul fast float [[I84]], poison +; CHECK-NEXT: [[I92:%.*]] = fadd fast float [[I91]], [[I87]] +; CHECK-NEXT: [[I93:%.*]] = fmul fast float [[I127]], poison +; CHECK-NEXT: [[I94:%.*]] = fadd fast float [[I93]], [[I88]] +; CHECK-NEXT: [[I95:%.*]] = fmul fast float [[I131]], poison +; CHECK-NEXT: [[I96:%.*]] = fadd fast float [[I95]], [[I89]] +; CHECK-NEXT: [[I97:%.*]] = fmul fast float [[I86]], poison +; CHECK-NEXT: [[I98:%.*]] = fadd fast float [[I97]], [[I90]] +; CHECK-NEXT: [[I99:%.*]] = fadd fast float [[I92]], poison +; 
CHECK-NEXT: [[I100:%.*]] = fadd fast float [[I94]], poison +; CHECK-NEXT: [[I101:%.*]] = fadd fast float [[I96]], poison +; CHECK-NEXT: [[I102:%.*]] = fadd fast float [[I98]], poison +; CHECK-NEXT: [[I103]] = fadd fast float [[I99]], poison +; CHECK-NEXT: [[I104]] = fadd fast float [[I100]], poison +; CHECK-NEXT: [[I105]] = fadd fast float [[I101]], poison +; CHECK-NEXT: [[I106]] = fadd fast float [[I102]], poison +; CHECK-NEXT: [[I107:%.*]] = fmul fast float [[I85]], poison +; CHECK-NEXT: [[I108:%.*]] = fmul fast float [[I80]], poison +; CHECK-NEXT: [[I109:%.*]] = fmul fast float [[I81]], poison +; CHECK-NEXT: [[I110:%.*]] = fmul fast float [[I82]], poison +; CHECK-NEXT: [[I111:%.*]] = fmul fast float [[I84]], poison +; CHECK-NEXT: [[I112:%.*]] = fadd fast float [[I111]], [[I107]] +; CHECK-NEXT: [[I113:%.*]] = fmul fast float [[I127]], poison +; CHECK-NEXT: [[I114:%.*]] = fadd fast float [[I113]], [[I108]] +; CHECK-NEXT: [[I115:%.*]] = fmul fast float [[I131]], poison +; CHECK-NEXT: [[I116:%.*]] = fadd fast float [[I115]], [[I109]] +; CHECK-NEXT: [[I117:%.*]] = fmul fast float [[I86]], poison +; CHECK-NEXT: [[I118:%.*]] = fadd fast float [[I117]], [[I110]] +; CHECK-NEXT: [[I119:%.*]] = fadd fast float [[I112]], poison +; CHECK-NEXT: [[I120:%.*]] = fadd fast float [[I114]], poison +; CHECK-NEXT: [[I121:%.*]] = fadd fast float [[I116]], poison +; CHECK-NEXT: [[I122:%.*]] = fadd fast float [[I118]], poison +; CHECK-NEXT: [[I123]] = fadd fast float [[I119]], poison +; CHECK-NEXT: [[I124]] = fadd fast float [[I120]], poison +; CHECK-NEXT: [[I125]] = fadd fast float [[I121]], poison +; CHECK-NEXT: [[I126]] = fadd fast float [[I122]], poison +; CHECK-NEXT: [[I135:%.*]] = fmul fast float [[I85]], [[I65]] +; CHECK-NEXT: [[I128:%.*]] = fmul fast float [[I80]], [[I65]] +; CHECK-NEXT: [[I129:%.*]] = fmul fast float [[I81]], [[I65]] +; CHECK-NEXT: [[I130:%.*]] = fmul fast float [[I82]], [[I65]] +; CHECK-NEXT: [[I133:%.*]] = fmul fast float [[I84]], [[I77]] +; CHECK-NEXT: [[I134:%.*]] = fadd fast float [[I133]], [[I135]] +; CHECK-NEXT: [[I136:%.*]] = fmul fast float [[I127]], [[I77]] +; CHECK-NEXT: [[TMP51:%.*]] = fadd fast float [[I136]], [[I128]] +; CHECK-NEXT: [[I138:%.*]] = fmul fast float [[I131]], [[I77]] +; CHECK-NEXT: [[TMP52:%.*]] = fadd fast float [[I138]], [[I129]] +; CHECK-NEXT: [[I137:%.*]] = fmul fast float [[I86]], [[I77]] +; CHECK-NEXT: [[I139:%.*]] = fadd fast float [[I137]], [[I130]] +; CHECK-NEXT: [[I140:%.*]] = fadd fast float [[I134]], poison +; CHECK-NEXT: [[I141:%.*]] = fadd fast float [[TMP51]], poison +; CHECK-NEXT: [[I142:%.*]] = fadd fast float [[TMP52]], poison +; CHECK-NEXT: [[I143:%.*]] = fadd fast float [[I139]], poison +; CHECK-NEXT: [[I144:%.*]] = fadd fast float [[I140]], poison +; CHECK-NEXT: [[I145:%.*]] = fadd fast float [[I141]], poison +; CHECK-NEXT: [[I146:%.*]] = fadd fast float [[I142]], poison +; CHECK-NEXT: [[I152:%.*]] = fadd fast float [[I143]], poison +; CHECK-NEXT: [[I147:%.*]] = fmul fast float [[I85]], poison +; CHECK-NEXT: [[I148:%.*]] = fmul fast float [[I80]], poison +; CHECK-NEXT: [[I149:%.*]] = fmul fast float [[I81]], poison +; CHECK-NEXT: [[I150:%.*]] = fmul fast float [[I82]], poison +; CHECK-NEXT: [[I151:%.*]] = fmul fast float [[I84]], poison +; CHECK-NEXT: [[TMP57:%.*]] = fadd fast float [[I151]], [[I147]] +; CHECK-NEXT: [[I153:%.*]] = fmul fast float [[I127]], poison +; CHECK-NEXT: [[TMP58:%.*]] = fadd fast float [[I153]], [[I148]] +; CHECK-NEXT: [[I155:%.*]] = fmul fast float [[I131]], poison +; CHECK-NEXT: [[TMP59:%.*]] = fadd fast float 
[[I155]], [[I149]] +; CHECK-NEXT: [[I157:%.*]] = fmul fast float [[I86]], poison +; CHECK-NEXT: [[TMP60:%.*]] = fadd fast float [[I157]], [[I150]] +; CHECK-NEXT: [[I159:%.*]] = fadd fast float [[TMP57]], poison +; CHECK-NEXT: [[I160:%.*]] = fadd fast float [[TMP58]], poison +; CHECK-NEXT: [[I161:%.*]] = fadd fast float [[TMP59]], poison +; CHECK-NEXT: [[I162:%.*]] = fadd fast float [[TMP60]], poison +; CHECK-NEXT: [[I163:%.*]] = fadd fast float [[I159]], poison +; CHECK-NEXT: [[I164:%.*]] = fadd fast float [[I160]], poison +; CHECK-NEXT: [[I165:%.*]] = fadd fast float [[I161]], poison +; CHECK-NEXT: [[I166:%.*]] = fadd fast float [[I162]], poison ; CHECK-NEXT: br i1 poison, label %[[BB78]], label %[[BB167]] ; CHECK: [[BB167]]: -; CHECK-NEXT: [[TMP32:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP29]], %[[BB78]] ] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x float> [[TMP32]], i32 14 +; CHECK-NEXT: [[I168:%.*]] = phi nsz float [ [[I76]], %[[BB64]] ], [ [[I166]], %[[BB78]] ] +; CHECK-NEXT: [[I169:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I165]], %[[BB78]] ] +; CHECK-NEXT: [[I170:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I164]], %[[BB78]] ] +; CHECK-NEXT: [[I171:%.*]] = phi nsz float [ [[I75]], %[[BB64]] ], [ [[I163]], %[[BB78]] ] +; CHECK-NEXT: [[I172:%.*]] = phi nsz float [ [[I74]], %[[BB64]] ], [ [[I152]], %[[BB78]] ] +; CHECK-NEXT: [[I173:%.*]] = phi nsz float [ [[I73]], %[[BB64]] ], [ [[I146]], %[[BB78]] ] +; CHECK-NEXT: [[TMP34:%.*]] = phi nsz float [ [[I72]], %[[BB64]] ], [ [[I145]], %[[BB78]] ] +; CHECK-NEXT: [[I175:%.*]] = phi nsz float [ [[I71]], %[[BB64]] ], [ [[I144]], %[[BB78]] ] +; CHECK-NEXT: [[I176:%.*]] = phi nsz float [ [[I70]], %[[BB64]] ], [ [[I126]], %[[BB78]] ] +; CHECK-NEXT: [[I177:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I125]], %[[BB78]] ] +; CHECK-NEXT: [[I178:%.*]] = phi nsz float [ [[I69]], %[[BB64]] ], [ [[I124]], %[[BB78]] ] +; CHECK-NEXT: [[I179:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I123]], %[[BB78]] ] +; CHECK-NEXT: [[I180:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I106]], %[[BB78]] ] +; CHECK-NEXT: [[I181:%.*]] = phi nsz float [ [[I68]], %[[BB64]] ], [ [[I105]], %[[BB78]] ] +; CHECK-NEXT: [[TMP33:%.*]] = phi nsz float [ [[I67]], %[[BB64]] ], [ [[I104]], %[[BB78]] ] +; CHECK-NEXT: [[I183:%.*]] = phi nsz float [ [[I66]], %[[BB64]] ], [ [[I103]], %[[BB78]] ] ; CHECK-NEXT: store float [[TMP33]], ptr poison, align 1 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x float> [[TMP32]], i32 13 ; CHECK-NEXT: store float [[TMP34]], ptr poison, align 1 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x float> [[TMP32]], i32 15 ; CHECK-NEXT: br i1 poison, label %[[BB186:.*]], label %[[BB184:.*]] ; CHECK: [[BB184]]: ; CHECK-NEXT: br label %[[BB185:.*]] ; CHECK: [[BB185]]: ; CHECK-NEXT: br i1 poison, label %[[BB185]], label %[[BB186]] ; CHECK: [[BB186]]: -; CHECK-NEXT: [[I187:%.*]] = phi nsz float [ [[TMP35]], %[[BB167]] ], [ poison, %[[BB185]] ] +; CHECK-NEXT: [[I187:%.*]] = phi nsz float [ [[I178]], %[[BB167]] ], [ poison, %[[BB185]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index 64bdcf2..8093285 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -8,35 +8,56 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 
%arg6, i1 %arg7, i1 %arg8) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: +; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float poison, float poison, float poison>, float [[ARG]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2> -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[ARG3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], <float 1.000000e+00, float 0.000000e+00> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP9]], <4 x i32> <i32 4, i32 5, i32 2, i32 3> -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[TMP6]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> +; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00 +; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00 +; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00 +; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00 +; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x float> [ [[TMP7]], [[BB:%.*]] ] -; CHECK-NEXT: [[VAL16:%.*]] = extractelement <4 x float> [[TMP7]], i32 2 +; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ] +; CHECK-NEXT: [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ] +; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ] +; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ] ; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00 -; CHECK-NEXT: [[VAL17:%.*]] = extractelement <4 x float> [[TMP7]], i32 3 ; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00 ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: -; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x float> [ [[TMP8]], [[BB18]] ] +; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ] +; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ] +; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ] +; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ] ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] ; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float> -; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]] -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP11]] -; 
CHECK-NEXT: [[VAL54:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP15]]) +; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float +; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1 +; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1 +; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float +; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2 +; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1 +; CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float +; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3 +; CHECK-NEXT: [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1 +; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float +; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]] +; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]] +; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]] +; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]] +; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]] +; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]] +; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]] +; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]] +; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]] +; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]] +; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]] ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) ; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index 27de36e..430a46b 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -600,29 +600,25 @@ define i32 @dot_product_i32_reorder(ptr %a, ptr %b) { } define float @dot_product_fp32(ptr %a, ptr %b) { -; NON-POW2-LABEL: @dot_product_fp32( -; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]] -; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]]) -; NON-POW2-NEXT: ret float [[TMP4]] -; -; POW2-ONLY-LABEL: @dot_product_fp32( -; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 -; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 -; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 -; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 -; 
POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] -; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]]) -; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] -; POW2-ONLY-NEXT: ret float [[ADD_1]] +; CHECK-LABEL: @dot_product_fp32( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1 +; CHECK-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1 +; CHECK-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]] +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret float [[ADD_1]] ; %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0 %l.a.0 = load float, ptr %gep.a.0, align 4 @@ -650,29 +646,25 @@ define float @dot_product_fp32(ptr %a, ptr %b) { ; Same as above, except the reduction order has been perturbed. This ; is checking for our ability to reorder. 
define float @dot_product_fp32_reorder(ptr %a, ptr %b) { -; NON-POW2-LABEL: @dot_product_fp32_reorder( -; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]] -; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]]) -; NON-POW2-NEXT: ret float [[TMP4]] -; -; POW2-ONLY-LABEL: @dot_product_fp32_reorder( -; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 -; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 -; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 -; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] -; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]]) -; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] -; POW2-ONLY-NEXT: ret float [[ADD_1]] +; CHECK-LABEL: @dot_product_fp32_reorder( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1 +; CHECK-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1 +; CHECK-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]] +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_1]], [[MUL_0]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret float [[ADD_1]] ; %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0 %l.a.0 = load float, ptr %gep.a.0, align 4 @@ -699,29 +691,25 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) { define double @dot_product_fp64(ptr %a, ptr %b) { -; NON-POW2-LABEL: @dot_product_fp64( -; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0 -; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr 
[[GEP_A_0]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]] -; NON-POW2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]]) -; NON-POW2-NEXT: ret double [[TMP4]] -; -; POW2-ONLY-LABEL: @dot_product_fp64( -; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2 -; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4 -; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2 -; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[GEP_A_0]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]] -; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]]) -; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]] -; POW2-ONLY-NEXT: ret double [[ADD_1]] +; CHECK-LABEL: @dot_product_fp64( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1 +; CHECK-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1 +; CHECK-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]] +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret double [[ADD_1]] ; %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0 %l.a.0 = load double, ptr %gep.a.0, align 4 @@ -778,21 +766,13 @@ entry: } define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) { -; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec( -; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1 -; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2 -; NON-POW2-NEXT: [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01) -; NON-POW2-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]]) -; NON-POW2-NEXT: ret float [[TMP5]] -; -; 
POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec( -; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01 -; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01 -; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01 -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]] -; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] -; POW2-ONLY-NEXT: ret float [[ADD_1]] +; CHECK-LABEL: @reduce_fadd_after_fmul_of_buildvec( +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret float [[ADD_1]] ; %mul.0 = fmul fast float %a, 10.0 %mul.1 = fmul fast float %b, 10.0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll index 4a8af6d..0879ec2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll @@ -2,7 +2,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 ; ; dot4(ptr x, ptr y) - ((xptr y[0])+(xptr y[1])+(xptr y[2])+(xptr y[3])) @@ -95,12 +95,47 @@ define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt } define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) { -; CHECK-LABEL: @dot4f64_fast( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) -; CHECK-NEXT: ret double [[TMP4]] +; SSE2-LABEL: @dot4f64_fast( +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; SSE2-NEXT: ret double [[TMP4]] +; +; SSE4-LABEL: @dot4f64_fast( +; SSE4-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; SSE4-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; SSE4-NEXT: ret double [[TMP4]] +; +; AVX-LABEL: @dot4f64_fast( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = fmul <4 
x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; AVX-NEXT: ret double [[TMP4]] +; +; AVX2-LABEL: @dot4f64_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX]], i64 2 +; AVX2-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY]], i64 2 +; AVX2-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[X1:%.*]] = load double, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[Y1:%.*]] = load double, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] +; AVX2-NEXT: [[MUL1:%.*]] = fmul double [[X1]], [[Y1]] +; AVX2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4 +; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[MUL1]] +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; AVX2-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP4]] +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; AVX2-NEXT: [[DOT0123:%.*]] = fadd fast double [[DOT012]], [[TMP5]] +; AVX2-NEXT: ret double [[DOT0123]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1 @@ -127,12 +162,47 @@ define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3 } define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { -; CHECK-LABEL: @dot4f32_fast( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) -; CHECK-NEXT: ret float [[TMP4]] +; SSE2-LABEL: @dot4f32_fast( +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; SSE2-NEXT: ret float [[TMP4]] +; +; SSE4-LABEL: @dot4f32_fast( +; SSE4-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; SSE4-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; SSE4-NEXT: ret float [[TMP4]] +; +; AVX-LABEL: @dot4f32_fast( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; AVX-NEXT: ret float [[TMP4]] +; +; AVX2-LABEL: @dot4f32_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr 
inbounds float, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX]], i64 2 +; AVX2-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY]], i64 2 +; AVX2-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[X1:%.*]] = load float, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[Y1:%.*]] = load float, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] +; AVX2-NEXT: [[MUL1:%.*]] = fmul float [[X1]], [[Y1]] +; AVX2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4 +; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[MUL1]] +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; AVX2-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP4]] +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; AVX2-NEXT: [[DOT0123:%.*]] = fadd fast float [[DOT012]], [[TMP5]] +; AVX2-NEXT: ret float [[DOT0123]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1 @@ -372,6 +442,18 @@ define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(1 ; AVX-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]] ; AVX-NEXT: ret double [[DOT01]] ; +; AVX2-LABEL: @dot2f64_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[X1:%.*]] = load double, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[Y1:%.*]] = load double, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] +; AVX2-NEXT: [[MUL1:%.*]] = fmul double [[X1]], [[Y1]] +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[MUL1]] +; AVX2-NEXT: ret double [[DOT01]] +; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1 %x0 = load double, ptr %ptrx, align 4 @@ -410,6 +492,18 @@ define float @dot2f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16 ; AVX-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; AVX-NEXT: ret float [[DOT01]] ; +; AVX2-LABEL: @dot2f32_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[X1:%.*]] = load float, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[Y1:%.*]] = load float, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] +; AVX2-NEXT: [[MUL1:%.*]] = fmul float [[X1]], [[Y1]] +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[MUL1]] +; AVX2-NEXT: ret float [[DOT01]] +; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1 %x0 = load float, ptr %ptrx, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index eaa77d7..0bbdeb55 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -31,12 +31,9 @@ define float @baz() { ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], 2.000000e+00 ; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0 -; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 1 -; THRESHOLD-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], splat (float 2.000000e+00) -; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 +; THRESHOLD-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP4]], 2.000000e+00 ; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] ; THRESHOLD-NEXT: store float [[OP_RDX]], ptr @res, align 4 ; THRESHOLD-NEXT: ret float [[OP_RDX]] @@ -76,14 +73,41 @@ define float @bazz() { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; CHECK-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]] ; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]] +; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16 +; CHECK-NEXT: [[TMP10:%.*]] = load float, 
ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16 +; CHECK-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4 +; CHECK-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8 +; CHECK-NEXT: [[MUL18_2:%.*]] = fmul fast float [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[MUL18_2]], [[ADD19_1]] +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4 +; CHECK-NEXT: [[MUL18_3:%.*]] = fmul fast float [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[MUL18_3]], [[ADD19_2]] ; CHECK-NEXT: store float [[OP_RDX1]], ptr @res, align 4 ; CHECK-NEXT: ret float [[OP_RDX1]] ; @@ -92,14 +116,41 @@ define float @bazz() { ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; THRESHOLD-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] +; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]] +; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]] +; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]] +; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]] +; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]] ; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) 
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]] +; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16 +; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16 +; THRESHOLD-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]] +; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4 +; THRESHOLD-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4 +; THRESHOLD-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]] +; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]] +; THRESHOLD-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8 +; THRESHOLD-NEXT: [[TMP14:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8 +; THRESHOLD-NEXT: [[MUL18_2:%.*]] = fmul fast float [[TMP14]], [[TMP13]] +; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float [[MUL18_2]], [[ADD19_1]] +; THRESHOLD-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4 +; THRESHOLD-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4 +; THRESHOLD-NEXT: [[MUL18_3:%.*]] = fmul fast float [[TMP16]], [[TMP15]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[MUL18_3]], [[ADD19_2]] ; THRESHOLD-NEXT: store float [[OP_RDX1]], ptr @res, align 4 ; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; @@ -151,10 +202,21 @@ define float @bazzz() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), 
align 4 +; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; CHECK-NEXT: store float [[TMP5]], ptr @res, align 4 ; CHECK-NEXT: ret float [[TMP5]] @@ -163,10 +225,21 @@ define float @bazzz() { ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; THRESHOLD-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] +; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] +; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] ; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; THRESHOLD-NEXT: store float [[TMP5]], ptr @res, align 4 ; THRESHOLD-NEXT: ret float [[TMP5]] @@ -199,10 +272,21 @@ define i32 @foo() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load 
float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; CHECK-NEXT: store i32 [[CONV4]], ptr @n, align 4 @@ -212,10 +296,21 @@ define i32 @foo() { ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; THRESHOLD-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] +; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] +; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] ; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; THRESHOLD-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; THRESHOLD-NEXT: store i32 [[CONV4]], ptr @n, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll index 1922e935..4527929 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -10,17 +10,65 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> 
poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15> -; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16 -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x double> poison) -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP5]]) +; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1 +; CHECK-NEXT: [[LD1_0:%.*]] = load double, ptr [[GEP1_0]], align 8 +; CHECK-NEXT: [[LD0_0:%.*]] = load double, ptr [[ARG1:%.*]], align 8 +; CHECK-NEXT: [[MUL1_0:%.*]] = fmul fast double [[LD0_0]], [[LD1_0]] +; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 16 +; CHECK-NEXT: [[LD2_0:%.*]] = load double, ptr [[GEP2_0]], align 8 +; CHECK-NEXT: [[MUL2_0:%.*]] = fmul fast double [[LD2_0]], [[LD1_0]] +; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 3 +; CHECK-NEXT: [[LD1_1:%.*]] = load double, ptr [[GEP1_1]], align 8 +; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 1 +; CHECK-NEXT: [[LD0_1:%.*]] = load double, ptr [[GEP0_1]], align 8 +; CHECK-NEXT: [[MUL1_1:%.*]] = fmul fast double [[LD0_1]], [[LD1_1]] +; CHECK-NEXT: [[RDX1_0:%.*]] = fadd fast double [[MUL1_0]], [[MUL1_1]] +; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 17 +; CHECK-NEXT: [[LD2_1:%.*]] = load double, ptr [[GEP2_1]], align 8 +; CHECK-NEXT: [[MUL2_1:%.*]] = fmul fast double [[LD2_1]], [[LD1_1]] +; CHECK-NEXT: [[RDX2_0:%.*]] = fadd fast double [[MUL2_0]], [[MUL2_1]] +; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 5 +; CHECK-NEXT: [[LD1_2:%.*]] = load double, ptr [[GEP1_2]], align 8 +; CHECK-NEXT: [[GEP0_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 2 +; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 18 +; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 7 +; CHECK-NEXT: [[LD1_3:%.*]] = load double, ptr [[GEP1_3]], align 8 +; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 9 +; CHECK-NEXT: [[LD1_4:%.*]] = load double, ptr [[GEP1_4]], align 8 +; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 11 +; CHECK-NEXT: [[LD1_5:%.*]] = load double, ptr [[GEP1_5]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP0_2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[LD1_2]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[LD1_3]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[LD1_4]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[LD1_5]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, ptr [[GEP2_2]], align 8 +; 
CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP4]] +; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 13 +; CHECK-NEXT: [[LD1_6:%.*]] = load double, ptr [[GEP1_6]], align 8 +; CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 6 +; CHECK-NEXT: [[LD0_6:%.*]] = load double, ptr [[GEP0_6]], align 8 +; CHECK-NEXT: [[MUL1_6:%.*]] = fmul fast double [[LD0_6]], [[LD1_6]] +; CHECK-NEXT: [[GEP2_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 22 +; CHECK-NEXT: [[LD2_6:%.*]] = load double, ptr [[GEP2_6]], align 8 +; CHECK-NEXT: [[MUL2_6:%.*]] = fmul fast double [[LD2_6]], [[LD1_6]] +; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 15 +; CHECK-NEXT: [[LD1_7:%.*]] = load double, ptr [[GEP1_7]], align 8 +; CHECK-NEXT: [[GEP0_7:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 7 +; CHECK-NEXT: [[LD0_7:%.*]] = load double, ptr [[GEP0_7]], align 8 +; CHECK-NEXT: [[MUL1_7:%.*]] = fmul fast double [[LD0_7]], [[LD1_7]] +; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast double [[TMP10]], [[MUL1_6]] +; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast double [[MUL1_7]], [[RDX1_0]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast double [[OP_RDX3]], [[OP_RDX4]] +; CHECK-NEXT: [[GEP2_7:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 23 +; CHECK-NEXT: [[LD2_7:%.*]] = load double, ptr [[GEP2_7]], align 8 +; CHECK-NEXT: [[MUL2_7:%.*]] = fmul fast double [[LD2_7]], [[LD1_7]] +; CHECK-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP7]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP11]], [[MUL2_6]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast double [[MUL2_7]], [[RDX2_0]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast double [[OP_RDX]], [[OP_RDX1]] ; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0 ; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll index f0272d5..33c281d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll @@ -6,9 +6,25 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg define void @rdx_feeds_single_insert(<2 x double> %v, ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) { ; CHECK-LABEL: @rdx_feeds_single_insert( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[ARG1:%.*]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <8 x double> [[TMP0]], <double 1.000000e+01, double 1.100000e+01, double 1.200000e+01, double 1.300000e+01, double 1.400000e+01, double 1.500000e+01, double 1.600000e+01, double 1.700000e+01> -; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP1]]) +; CHECK-NEXT: [[LD0_0:%.*]] = load double, ptr [[ARG1:%.*]], align 8 +; CHECK-NEXT: [[MUL1_0:%.*]] = fmul fast double [[LD0_0]], 1.000000e+01 +; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 1 +; CHECK-NEXT: [[LD0_1:%.*]] = load double, ptr [[GEP0_1]], align 
8 +; CHECK-NEXT: [[MUL1_1:%.*]] = fmul fast double [[LD0_1]], 1.100000e+01 +; CHECK-NEXT: [[RDX1_0:%.*]] = fadd fast double [[MUL1_0]], [[MUL1_1]] +; CHECK-NEXT: [[GEP0_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 2 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP0_2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <4 x double> [[TMP0]], <double 1.200000e+01, double 1.300000e+01, double 1.400000e+01, double 1.500000e+01> +; CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 6 +; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[GEP0_6]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP3]], <double 1.600000e+01, double 1.700000e+01> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[RDX1_0]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast double [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[I:%.*]] = insertelement <2 x double> [[V:%.*]], double [[TMP2]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true)) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll index 8c9f8b5..359c24b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll @@ -1,27 +1,39 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=CHECK -; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=CHECK -; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=CHECK +; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=SSE4 +; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX +; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX ; This test checks for a case when a horizontal reduction of floating-point ; adds may look profitable, but is not because it eliminates generation of ; floating-point FMAs that would be more profitable. -; FIXME: We generate a horizontal reduction today. 
- define void @hr() { -; CHECK-LABEL: @hr( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]] -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void +; SSE4-LABEL: @hr( +; SSE4-NEXT: br label [[LOOP:%.*]] +; SSE4: loop: +; SSE4-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ] +; SSE4-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double +; SSE4-NEXT: [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0 +; SSE4-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]] +; SSE4-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) +; SSE4-NEXT: [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]] +; SSE4-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] +; SSE4: exit: +; SSE4-NEXT: ret void +; +; AVX-LABEL: @hr( +; AVX-NEXT: br label [[LOOP:%.*]] +; AVX: loop: +; AVX-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ] +; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double +; AVX-NEXT: [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]] +; AVX-NEXT: [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]] +; AVX-NEXT: [[ADD1:%.*]] = fadd fast double 0.000000e+00, [[ADD0]] +; AVX-NEXT: [[ADD2:%.*]] = fadd fast double 0.000000e+00, [[ADD1]] +; AVX-NEXT: [[ADD3]] = fadd fast double 0.000000e+00, [[ADD2]] +; AVX-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] +; AVX: exit: +; AVX-NEXT: ret void ; br label %loop @@ -47,18 +59,27 @@ exit: ; may look profitable; but both are not because this eliminates generation ; of floating-point FMAs that would be more profitable. -; FIXME: We generate a horizontal reduction today, and if that's disabled, we -; still vectorize some of the multiplies. 
- define double @hr_or_mul() { -; CHECK-LABEL: @hr_or_mul( -; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]] -; CHECK-NEXT: ret double [[OP_RDX]] +; SSE4-LABEL: @hr_or_mul( +; SSE4-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double +; SSE4-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0 +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer +; SSE4-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; SSE4-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]] +; SSE4-NEXT: ret double [[OP_RDX]] +; +; AVX-LABEL: @hr_or_mul( +; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double +; AVX-NEXT: [[MUL0:%.*]] = fmul fast double 7.000000e+00, [[CVT0]] +; AVX-NEXT: [[ADD0:%.*]] = fadd fast double [[MUL0]], [[CVT0]] +; AVX-NEXT: [[MUL1:%.*]] = fmul fast double -4.300000e+01, [[CVT0]] +; AVX-NEXT: [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD0]] +; AVX-NEXT: [[MUL2:%.*]] = fmul fast double 2.200000e-02, [[CVT0]] +; AVX-NEXT: [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]] +; AVX-NEXT: [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]] +; AVX-NEXT: [[ADD3:%.*]] = fadd fast double [[MUL3]], [[ADD2]] +; AVX-NEXT: ret double [[ADD3]] ; %cvt0 = uitofp i16 3 to double %mul0 = fmul fast double 7.000000e+00, %cvt0 diff --git a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll index a64075d..5fe02cb 100644 --- a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll @@ -1,32 +1,57 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %} define void @test() { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[BODY:%.*]] -; CHECK: body: -; CHECK-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[PHI1]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> <double 0.000000e+00, double undef>, [[TMP8]] -; 
CHECK-NEXT: [[ADD8_I_I:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP9]]) -; CHECK-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00 -; CHECK-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]] -; CHECK: exit: -; CHECK-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]] -; CHECK: if.then135.i: -; CHECK-NEXT: [[TMP1:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> <i1 poison, i1 false>, <2 x i1> [[TMP1]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], zeroinitializer -; CHECK-NEXT: br label [[IF_END209_I]] -; CHECK: if.end209.i: -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x double> [ [[TMP6]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] -; CHECK-NEXT: ret void +; X86-LABEL: @test( +; X86-NEXT: entry: +; X86-NEXT: br label [[BODY:%.*]] +; X86: body: +; X86-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ] +; X86-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ] +; X86-NEXT: [[TMP1:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[PHI1]], i32 0 +; X86-NEXT: [[TMP2:%.*]] = fmul fast <2 x double> <double 0.000000e+00, double undef>, [[TMP1]] +; X86-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP2]]) +; X86-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[TMP3]], 0.000000e+00 +; X86-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]] +; X86: exit: +; X86-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]] +; X86: if.then135.i: +; X86-NEXT: [[TMP4:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer +; X86-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> <i1 poison, i1 false>, <2 x i1> [[TMP4]], <2 x i32> <i32 2, i32 1> +; X86-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x double> zeroinitializer, <2 x double> zeroinitializer +; X86-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP6]] +; X86-NEXT: [[TMP8:%.*]] = fmul fast <2 x double> [[TMP7]], zeroinitializer +; X86-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP8]], zeroinitializer +; X86-NEXT: br label [[IF_END209_I]] +; X86: if.end209.i: +; X86-NEXT: [[TMP10:%.*]] = phi <2 x double> [ [[TMP9]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] +; X86-NEXT: ret void +; +; AARCH64-LABEL: @test( +; AARCH64-NEXT: entry: +; AARCH64-NEXT: br label [[BODY:%.*]] +; AARCH64: body: +; AARCH64-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ] +; AARCH64-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ] +; AARCH64-NEXT: [[MUL_I478_I:%.*]] = fmul fast double [[PHI1]], 0.000000e+00 +; AARCH64-NEXT: [[MUL7_I485_I:%.*]] = fmul fast double undef, 0.000000e+00 +; AARCH64-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[MUL_I478_I]], [[MUL7_I485_I]] +; AARCH64-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00 +; AARCH64-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]] +; AARCH64: exit: +; AARCH64-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label 
[[IF_END209_I:%.*]] +; AARCH64: if.then135.i: +; AARCH64-NEXT: [[TMP1:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer +; AARCH64-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> <i1 poison, i1 false>, <2 x i1> [[TMP1]], <2 x i32> <i32 2, i32 1> +; AARCH64-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer +; AARCH64-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP3]] +; AARCH64-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP4]], zeroinitializer +; AARCH64-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], zeroinitializer +; AARCH64-NEXT: br label [[IF_END209_I]] +; AARCH64: if.end209.i: +; AARCH64-NEXT: [[TMP7:%.*]] = phi <2 x double> [ [[TMP6]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] +; AARCH64-NEXT: ret void ; entry: br label %body diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll index 1e4b598..b5d74f0b 100644 --- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll +++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll @@ -1,24 +1,45 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH86 %} define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778) { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54 -; CHECK-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8 -; CHECK-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55 -; CHECK-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8 -; CHECK-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: ret <4 x double> [[TMP7]] +; X86-LABEL: @test( +; X86-NEXT: entry: +; X86-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54 +; X86-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8 +; X86-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55 +; X86-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8 +; X86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]] +; X86-NEXT: [[TMP0:%.*]] = 
insertelement <4 x double> poison, double [[I1754:%.*]], i32 0 +; X86-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1 +; X86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2 +; X86-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3 +; X86-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer +; X86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]] +; X86-NEXT: [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3 +; X86-NEXT: [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]] +; X86-NEXT: ret <4 x double> [[TMP7]] +; +; AARCH86-LABEL: @test( +; AARCH86-NEXT: entry: +; AARCH86-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54 +; AARCH86-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8 +; AARCH86-NEXT: [[I1773:%.*]] = fmul fast double [[I1772]], [[I1754:%.*]] +; AARCH86-NEXT: [[I1782:%.*]] = fmul fast double [[I1754]], [[I1754]] +; AARCH86-NEXT: [[I1783:%.*]] = fadd fast double [[I1782]], 1.000000e+00 +; AARCH86-NEXT: [[I1787:%.*]] = fmul fast double [[I1778:%.*]], [[I1754]] +; AARCH86-NEXT: [[I1788:%.*]] = fadd fast double [[I1787]], 1.000000e+00 +; AARCH86-NEXT: [[I1792:%.*]] = fmul fast double [[I1754]], [[I1781:%.*]] +; AARCH86-NEXT: [[I1793:%.*]] = fadd fast double [[I1792]], 1.000000e+00 +; AARCH86-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55 +; AARCH86-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8 +; AARCH86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781]] +; AARCH86-NEXT: [[TMP4:%.*]] = fadd fast double [[I1773]], [[I1797]] +; AARCH86-NEXT: [[I1976:%.*]] = insertelement <4 x double> zeroinitializer, double [[I1783]], i64 0 +; AARCH86-NEXT: [[I1982:%.*]] = insertelement <4 x double> [[I1976]], double [[I1788]], i64 1 +; AARCH86-NEXT: [[I1988:%.*]] = insertelement <4 x double> [[I1982]], double [[I1793]], i64 2 +; AARCH86-NEXT: [[I1994:%.*]] = insertelement <4 x double> [[I1988]], double [[TMP4]], i64 3 +; AARCH86-NEXT: ret <4 x double> [[I1994]] ; entry: %i1771 = getelementptr inbounds double, ptr %p2, i64 54 diff --git a/llvm/test/Transforms/Scalarizer/intrinsics.ll b/llvm/test/Transforms/Scalarizer/intrinsics.ll index cee44ef..070c765 100644 --- a/llvm/test/Transforms/Scalarizer/intrinsics.ll +++ b/llvm/test/Transforms/Scalarizer/intrinsics.ll @@ -8,6 +8,7 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>) declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>) +declare <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float>, <2 x i32>) ; Ternary fp declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) @@ -32,6 +33,8 @@ declare <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float>) ; Unary fp operand, int return type declare <2 x i32> @llvm.lrint.v2i32.v2f32(<2 x float>) declare <2 x i32> @llvm.llrint.v2i32.v2f32(<2 x float>) +declare <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float>) +declare <2 x i32> @llvm.llround.v2i32.v2f32(<2 x float>) ; Bool return type, overloaded on fp operand type declare <2 x i1> @llvm.is.fpclass(<2 x float>, i32) @@ -159,6 +162,22 @@ define <2 x float> @scalarize_powi_v2f32(<2 x float> %x, i32 %y) #0 { ret <2 x float> %powi } +define <2 x float> 
@scalarize_ldexp_v2f32(<2 x float> %x, <2 x i32> %y) #0 { +; CHECK-LABEL: @scalarize_ldexp_v2f32( +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 +; CHECK-NEXT: [[Y:%.*]] = extractelement <2 x i32> [[Y1:%.*]], i64 0 +; CHECK-NEXT: [[POWI_I0:%.*]] = call float @llvm.ldexp.f32.i32(float [[X_I0]], i32 [[Y]]) +; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 +; CHECK-NEXT: [[Y_I1:%.*]] = extractelement <2 x i32> [[Y1]], i64 1 +; CHECK-NEXT: [[POWI_I1:%.*]] = call float @llvm.ldexp.f32.i32(float [[X_I1]], i32 [[Y_I1]]) +; CHECK-NEXT: [[POWI_UPTO0:%.*]] = insertelement <2 x float> poison, float [[POWI_I0]], i64 0 +; CHECK-NEXT: [[POWI:%.*]] = insertelement <2 x float> [[POWI_UPTO0]], float [[POWI_I1]], i64 1 +; CHECK-NEXT: ret <2 x float> [[POWI]] +; + %powi = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> %y) + ret <2 x float> %powi +} + define <2 x i32> @scalarize_smul_fix_sat_v2i32(<2 x i32> %x) #0 { ; CHECK-LABEL: @scalarize_smul_fix_sat_v2i32( ; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 @@ -243,6 +262,34 @@ define <2 x i32> @scalarize_llrint(<2 x float> %x) #0 { ret <2 x i32> %rnd } +define <2 x i32> @scalarize_lround(<2 x float> %x) #0 { +; CHECK-LABEL: @scalarize_lround( +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 +; CHECK-NEXT: [[RND_I0:%.*]] = call i32 @llvm.lround.i32.f32(float [[X_I0]]) +; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 +; CHECK-NEXT: [[RND_I1:%.*]] = call i32 @llvm.lround.i32.f32(float [[X_I1]]) +; CHECK-NEXT: [[RND_UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[RND_I0]], i64 0 +; CHECK-NEXT: [[RND:%.*]] = insertelement <2 x i32> [[RND_UPTO0]], i32 [[RND_I1]], i64 1 +; CHECK-NEXT: ret <2 x i32> [[RND]] +; + %rnd = call <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %x) + ret <2 x i32> %rnd +} + +define <2 x i32> @scalarize_llround(<2 x float> %x) #0 { +; CHECK-LABEL: @scalarize_llround( +; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 +; CHECK-NEXT: [[RND_I0:%.*]] = call i32 @llvm.llround.i32.f32(float [[X_I0]]) +; CHECK-NEXT: [[X_I1:%.*]] = extractelement <2 x float> [[X]], i64 1 +; CHECK-NEXT: [[RND_I1:%.*]] = call i32 @llvm.llround.i32.f32(float [[X_I1]]) +; CHECK-NEXT: [[RND_UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[RND_I0]], i64 0 +; CHECK-NEXT: [[RND:%.*]] = insertelement <2 x i32> [[RND_UPTO0]], i32 [[RND_I1]], i64 1 +; CHECK-NEXT: ret <2 x i32> [[RND]] +; + %rnd = call <2 x i32> @llvm.llround.v2i32.v2f32(<2 x float> %x) + ret <2 x i32> %rnd +} + define <2 x i1> @scalarize_is_fpclass(<2 x float> %x) #0 { ; CHECK-LABEL: @scalarize_is_fpclass( ; CHECK-NEXT: [[X_I0:%.*]] = extractelement <2 x float> [[X:%.*]], i64 0 diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp index d6e29a3..6dccf21 100644 --- a/llvm/tools/llvm-objdump/MachODump.cpp +++ b/llvm/tools/llvm-objdump/MachODump.cpp @@ -1749,7 +1749,7 @@ static void DumpLiteralPointerSection(MachOObjectFile *O, StringRef BytesStr = unwrapOrError(Sect->getContents(), O->getFileName()); - const char *Contents = reinterpret_cast<const char *>(BytesStr.data()); + const char *Contents = BytesStr.data(); switch (section_type) { case MachO::S_CSTRING_LITERALS: @@ -1965,7 +1965,7 @@ static void DumpSectionContents(StringRef Filename, MachOObjectFile *O, StringRef BytesStr = unwrapOrError(Section.getContents(), O->getFileName()); - const char *sect = reinterpret_cast<const char *>(BytesStr.data()); + 
const char *sect = BytesStr.data(); uint32_t sect_size = BytesStr.size(); uint64_t sect_addr = Section.getAddress(); @@ -2049,7 +2049,7 @@ static void DumpInfoPlistSectionContents(StringRef Filename, outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; StringRef BytesStr = unwrapOrError(Section.getContents(), O->getFileName()); - const char *sect = reinterpret_cast<const char *>(BytesStr.data()); + const char *sect = BytesStr.data(); outs() << format("%.*s", BytesStr.size(), sect) << "\n"; return; } @@ -3237,7 +3237,7 @@ static const char *GuessCstringPointer(uint64_t ReferenceValue, uint64_t object_offset = Sec.offset + sect_offset; StringRef MachOContents = info->O->getData(); uint64_t object_size = MachOContents.size(); - const char *object_addr = (const char *)MachOContents.data(); + const char *object_addr = MachOContents.data(); if (object_offset < object_size) { const char *name = object_addr + object_offset; return name; @@ -3258,7 +3258,7 @@ static const char *GuessCstringPointer(uint64_t ReferenceValue, uint64_t object_offset = Sec.offset + sect_offset; StringRef MachOContents = info->O->getData(); uint64_t object_size = MachOContents.size(); - const char *object_addr = (const char *)MachOContents.data(); + const char *object_addr = MachOContents.data(); if (object_offset < object_size) { const char *name = object_addr + object_offset; return name; @@ -3447,7 +3447,7 @@ static uint64_t GuessPointerPointer(uint64_t ReferenceValue, uint64_t object_offset = Sec.offset + sect_offset; StringRef MachOContents = info->O->getData(); uint64_t object_size = MachOContents.size(); - const char *object_addr = (const char *)MachOContents.data(); + const char *object_addr = MachOContents.data(); if (object_offset < object_size) { uint64_t pointer_value; memcpy(&pointer_value, object_addr + object_offset, @@ -4350,7 +4350,7 @@ walk_pointer_list_64(const char *listname, const SectionRef S, outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; StringRef BytesStr = unwrapOrError(S.getContents(), O->getFileName()); - const char *Contents = reinterpret_cast<const char *>(BytesStr.data()); + const char *Contents = BytesStr.data(); for (uint32_t i = 0; i < S.getSize(); i += sizeof(uint64_t)) { uint32_t left = S.getSize() - i; @@ -4399,7 +4399,7 @@ walk_pointer_list_32(const char *listname, const SectionRef S, outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; StringRef BytesStr = unwrapOrError(S.getContents(), O->getFileName()); - const char *Contents = reinterpret_cast<const char *>(BytesStr.data()); + const char *Contents = BytesStr.data(); for (uint32_t i = 0; i < S.getSize(); i += sizeof(uint32_t)) { uint32_t left = S.getSize() - i; diff --git a/llvm/unittests/Support/DebugLogTest.cpp b/llvm/unittests/Support/DebugLogTest.cpp index c24d1a5..b28c59c 100644 --- a/llvm/unittests/Support/DebugLogTest.cpp +++ b/llvm/unittests/Support/DebugLogTest.cpp @@ -6,11 +6,6 @@ // //===----------------------------------------------------------------------===// -// This macro is defined in the LLVM build system, but we undefine it here -// so that we test at least once in-tree the case where __SHORT_FILE__ is not -// defined. -#undef __SHORT_FILE__ - #include "llvm/Support/DebugLog.h" #include "llvm/ADT/Sequence.h" #include "llvm/Support/raw_ostream.h" |