Diffstat (limited to 'llvm')
-rw-r--r-- llvm/docs/LangRef.rst | 18
-rw-r--r-- llvm/docs/ReleaseNotes.md | 1
-rw-r--r-- llvm/include/llvm-c/Core.h | 2
-rw-r--r-- llvm/include/llvm/ADT/Bitfields.h | 88
-rw-r--r-- llvm/include/llvm/ADT/StringSwitch.h | 52
-rw-r--r-- llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h | 6
-rw-r--r-- llvm/include/llvm/Analysis/StaticDataProfileInfo.h | 18
-rw-r--r-- llvm/include/llvm/CodeGen/ISDOpcodes.h | 6
-rw-r--r-- llvm/include/llvm/CodeGen/SelectionDAG.h | 6
-rw-r--r-- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def | 2
-rw-r--r-- llvm/include/llvm/IR/ConstantFPRange.h | 13
-rw-r--r-- llvm/include/llvm/IR/IRBuilder.h | 5
-rw-r--r-- llvm/include/llvm/Support/DebugCounter.h | 3
-rw-r--r-- llvm/include/llvm/Support/Format.h | 15
-rw-r--r-- llvm/include/llvm/Target/TargetSelectionDAG.td | 1
-rw-r--r-- llvm/include/llvm/TargetParser/RISCVTargetParser.h | 2
-rw-r--r-- llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h | 8
-rw-r--r-- llvm/include/llvm/Transforms/Coroutines/SpillUtils.h | 9
-rw-r--r-- llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h | 5
-rw-r--r-- llvm/include/llvm/XRay/BlockIndexer.h | 6
-rw-r--r-- llvm/include/llvm/XRay/BlockPrinter.h | 6
-rw-r--r-- llvm/include/llvm/XRay/BlockVerifier.h | 6
-rw-r--r-- llvm/include/llvm/XRay/FDRLogBuilder.h | 6
-rw-r--r-- llvm/include/llvm/XRay/FDRRecordConsumer.h | 6
-rw-r--r-- llvm/include/llvm/XRay/FDRRecordProducer.h | 6
-rw-r--r-- llvm/include/llvm/XRay/FDRRecords.h | 6
-rw-r--r-- llvm/include/llvm/XRay/FDRTraceExpander.h | 6
-rw-r--r-- llvm/include/llvm/XRay/FDRTraceWriter.h | 6
-rw-r--r-- llvm/include/llvm/XRay/FileHeaderReader.h | 6
-rw-r--r-- llvm/include/llvm/XRay/Graph.h | 7
-rw-r--r-- llvm/include/llvm/XRay/InstrumentationMap.h | 19
-rw-r--r-- llvm/include/llvm/XRay/Profile.h | 6
-rw-r--r-- llvm/include/llvm/XRay/RecordPrinter.h | 6
-rw-r--r-- llvm/include/llvm/XRay/Trace.h | 6
-rw-r--r-- llvm/include/llvm/XRay/XRayRecord.h | 6
-rw-r--r-- llvm/include/llvm/XRay/YAMLXRayRecord.h | 18
-rw-r--r-- llvm/lib/Analysis/ScalarEvolution.cpp | 49
-rw-r--r-- llvm/lib/Analysis/StaticDataProfileInfo.cpp | 40
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/AIXException.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/BasicBlockPathCloning.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/BreakFalseDeps.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/EdgeBundles.cpp | 11
-rw-r--r-- llvm/lib/CodeGen/ExpandFp.cpp | 1
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp | 38
-rw-r--r-- llvm/lib/CodeGen/GlobalMergeFunctions.cpp | 10
-rw-r--r-- llvm/lib/CodeGen/LiveIntervals.cpp | 6
-rw-r--r-- llvm/lib/CodeGen/MIR2Vec.cpp | 13
-rw-r--r-- llvm/lib/CodeGen/MIRFSDiscriminator.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/MIRNamerPass.cpp | 17
-rw-r--r-- llvm/lib/CodeGen/MIRPrinter.cpp | 6
-rw-r--r-- llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp | 9
-rw-r--r-- llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp | 18
-rw-r--r-- llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp | 9
-rw-r--r-- llvm/lib/CodeGen/MachineFunction.cpp | 62
-rw-r--r-- llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp | 8
-rw-r--r-- llvm/lib/CodeGen/MachineOutliner.cpp | 5
-rw-r--r-- llvm/lib/CodeGen/MachinePipeliner.cpp | 13
-rw-r--r-- llvm/lib/CodeGen/MachineScheduler.cpp | 104
-rw-r--r-- llvm/lib/CodeGen/MachineTraceMetrics.cpp | 7
-rw-r--r-- llvm/lib/CodeGen/NonRelocatableStringpool.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/SafeStack.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 6
-rw-r--r-- llvm/lib/CodeGen/ScheduleDAGPrinter.cpp | 80
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 17
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 33
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 7
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 1
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 1
-rw-r--r-- llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp | 5
-rw-r--r-- llvm/lib/CodeGen/StaticDataAnnotator.cpp | 15
-rw-r--r-- llvm/lib/CodeGen/StaticDataSplitter.cpp | 6
-rw-r--r-- llvm/lib/CodeGen/TargetLoweringBase.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/TargetRegisterInfo.cpp | 17
-rw-r--r-- llvm/lib/IR/ConstantFPRange.cpp | 166
-rw-r--r-- llvm/lib/IR/Constants.cpp | 7
-rw-r--r-- llvm/lib/IR/Core.cpp | 11
-rw-r--r-- llvm/lib/IR/IRBuilder.cpp | 13
-rw-r--r-- llvm/lib/IR/Instructions.cpp | 16
-rw-r--r-- llvm/lib/IR/Verifier.cpp | 2
-rw-r--r-- llvm/lib/Support/DebugCounter.cpp | 56
-rw-r--r-- llvm/lib/TableGen/Record.cpp | 9
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 12
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 132
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 31
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 31
-rw-r--r-- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 28
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 8
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 16
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 34
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 34
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 34
-rw-r--r-- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 4
-rw-r--r-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 66
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 205
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrFormats.td | 16
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 8
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 22
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 9
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td | 143
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrPredicates.td | 36
-rw-r--r-- llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 4
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 21
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h | 2
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 68
-rw-r--r-- llvm/lib/TargetParser/RISCVTargetParser.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Coroutines/CoroCloner.h | 9
-rw-r--r-- llvm/lib/Transforms/Coroutines/CoroEarly.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Coroutines/CoroInternal.h | 9
-rw-r--r-- llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp | 5
-rw-r--r-- llvm/lib/Transforms/Coroutines/SpillUtils.cpp | 37
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 2
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 4
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 5
-rw-r--r-- llvm/lib/Transforms/Instrumentation/MemProfUse.cpp | 55
-rw-r--r-- llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp | 43
-rw-r--r-- llvm/lib/Transforms/Utils/CloneFunction.cpp | 67
-rw-r--r-- llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp | 105
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 21
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.h | 1
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h | 1
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 25
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 3
-rw-r--r-- llvm/lib/XRay/BlockIndexer.cpp | 7
-rw-r--r-- llvm/lib/XRay/BlockPrinter.cpp | 7
-rw-r--r-- llvm/lib/XRay/BlockVerifier.cpp | 18
-rw-r--r-- llvm/lib/XRay/FDRRecordProducer.cpp | 14
-rw-r--r-- llvm/lib/XRay/FDRRecords.cpp | 7
-rw-r--r-- llvm/lib/XRay/FDRTraceExpander.cpp | 7
-rw-r--r-- llvm/lib/XRay/FDRTraceWriter.cpp | 12
-rw-r--r-- llvm/lib/XRay/FileHeaderReader.cpp | 12
-rw-r--r-- llvm/lib/XRay/LogBuilderConsumer.cpp | 7
-rw-r--r-- llvm/lib/XRay/Profile.cpp | 18
-rw-r--r-- llvm/lib/XRay/RecordInitializer.cpp | 7
-rw-r--r-- llvm/lib/XRay/RecordPrinter.cpp | 7
-rw-r--r-- llvm/lib/XRay/Trace.cpp | 18
-rw-r--r-- llvm/test/Analysis/ScalarEvolution/ptrtoint.ll | 78
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir | 276
-rw-r--r-- llvm/test/CodeGen/AArch64/adds_cmn.ll | 6
-rw-r--r-- llvm/test/CodeGen/AArch64/sat-add.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll | 448
-rw-r--r-- llvm/test/CodeGen/AMDGPU/sched.group.classification.mir | 59
-rw-r--r-- llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir | 7
-rw-r--r-- llvm/test/CodeGen/ARM/carry.ll | 87
-rw-r--r-- llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll | 4
-rw-r--r-- llvm/test/CodeGen/NVPTX/tcgen05-commit.ll | 4
-rw-r--r-- llvm/test/CodeGen/NVPTX/tcgen05-cp.ll | 4
-rw-r--r-- llvm/test/CodeGen/NVPTX/tcgen05-fence.ll | 4
-rw-r--r-- llvm/test/CodeGen/NVPTX/tcgen05-ld.ll | 4
-rw-r--r-- llvm/test/CodeGen/NVPTX/tcgen05-shift.ll | 2
-rw-r--r-- llvm/test/CodeGen/NVPTX/tcgen05-st.ll | 4
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir | 523
-rw-r--r-- llvm/test/CodeGen/Thumb2/carry.ll | 59
-rw-r--r-- llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll | 106
-rw-r--r-- llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll | 1309
-rw-r--r-- llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll | 67
-rw-r--r-- llvm/test/CodeGen/X86/avg.ll | 74
-rw-r--r-- llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll | 13
-rw-r--r-- llvm/test/CodeGen/X86/global-variable-partition.ll | 18
-rw-r--r-- llvm/test/CodeGen/X86/relptr-rodata.ll | 15
-rw-r--r-- llvm/test/CodeGen/X86/setcc-wide-types.ll | 155
-rw-r--r-- llvm/test/MC/WebAssembly/simd-encodings.s | 8
-rw-r--r-- llvm/test/Other/debugcounter-dce.ll | 10
-rw-r--r-- llvm/test/TableGen/listsplat.td | 6
-rw-r--r-- llvm/test/Transforms/InstCombine/add-sitofp.ll | 7
-rw-r--r-- llvm/test/Transforms/InstCombine/binop-itofp.ll | 22
-rw-r--r-- llvm/test/Transforms/InstCombine/ptrtoaddr.ll | 5
-rw-r--r-- llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll | 103
-rw-r--r-- llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll | 539
-rw-r--r-- llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll | 312
-rw-r--r-- llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll | 9
-rw-r--r-- llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll | 9
-rw-r--r-- llvm/test/Transforms/PGOProfile/data-access-profile.ll | 83
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll | 16
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll | 6
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll | 27
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll | 8
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll | 6
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll | 9
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll | 47
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll | 14
-rw-r--r-- llvm/test/Verifier/llvm.used-invalid-init.ll | 2
-rw-r--r-- llvm/tools/bugpoint/BugDriver.cpp | 18
-rw-r--r-- llvm/tools/bugpoint/BugDriver.h | 37
-rw-r--r-- llvm/tools/bugpoint/CrashDebugger.cpp | 124
-rw-r--r-- llvm/tools/bugpoint/ExecutionDriver.cpp | 57
-rw-r--r-- llvm/tools/bugpoint/ExtractFunction.cpp | 64
-rw-r--r-- llvm/tools/bugpoint/Miscompilation.cpp | 40
-rw-r--r-- llvm/tools/bugpoint/OptimizerDriver.cpp | 2
-rw-r--r-- llvm/tools/bugpoint/ToolRunner.cpp | 32
-rw-r--r-- llvm/tools/bugpoint/bugpoint.cpp | 2
-rw-r--r-- llvm/unittests/ADT/BitFieldsTest.cpp | 4
-rw-r--r-- llvm/unittests/ADT/StringSwitchTest.cpp | 13
-rw-r--r-- llvm/unittests/IR/ConstantFPRangeTest.cpp | 176
-rw-r--r-- llvm/unittests/IR/InstructionsTest.cpp | 40
-rw-r--r-- llvm/unittests/Support/CMakeLists.txt | 1
-rw-r--r-- llvm/unittests/Support/Format.cpp | 56
-rw-r--r-- llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp | 51
-rw-r--r-- llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp | 220
-rw-r--r-- llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp | 9
-rw-r--r-- llvm/utils/profcheck-xfail.txt | 77
208 files changed, 6916 insertions, 1537 deletions
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 8b6c25c..4884e2d 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -21074,12 +21074,12 @@ Overview:
The '``llvm.matrix.column.major.load.*``' intrinsics load a ``<Rows> x <Cols>``
matrix using a stride of ``%Stride`` to compute the start address of the
-different columns. The offset is computed using ``%Stride``'s bitwidth. This
-allows for convenient loading of sub matrixes. If ``<IsVolatile>`` is true, the
-intrinsic is considered a :ref:`volatile memory access <volatile>`. The result
-matrix is returned in the result vector. If the ``%Ptr`` argument is known to
-be aligned to some boundary, this can be specified as an attribute on the
-argument.
+different columns. This allows for convenient loading of sub-matrices.
+Independent of ``%Stride``'s bitwidth, the offset is computed using the target
+data layout's pointer index type. If ``<IsVolatile>`` is true, the intrinsic is
+considered a :ref:`volatile memory access <volatile>`. The result matrix is
+returned in the result vector. If the ``%Ptr`` argument is known to be aligned
+to some boundary, this can be specified as an attribute on the argument.
Arguments:
""""""""""
@@ -21114,9 +21114,9 @@ Overview:
The '``llvm.matrix.column.major.store.*``' intrinsics store the ``<Rows> x
<Cols>`` matrix in ``%In`` to memory using a stride of ``%Stride`` between
-columns. The offset is computed using ``%Stride``'s bitwidth. If
-``<IsVolatile>`` is true, the intrinsic is considered a
-:ref:`volatile memory access <volatile>`.
+columns. Independent of ``%Stride``'s bitwidth, the offset is computed using
+the target data layout's pointer index type. If ``<IsVolatile>`` is true, the
+intrinsic is considered a :ref:`volatile memory access <volatile>`.
If the ``%Ptr`` argument is known to be aligned to some boundary, this can be
specified as an attribute on the argument.
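
For illustration, a minimal sketch (not part of the patch) of emitting such a load from C++, assuming the MatrixBuilder helper keeps its current CreateColumnMajorLoad signature:

    #include "llvm/IR/MatrixBuilder.h"

    // Load a 3x3 double matrix column-major with a dynamic i64 stride. Per
    // the wording above, the stride's bitwidth no longer affects the address
    // math: offsets are computed in the pointer index type.
    llvm::Value *loadMat3x3(llvm::IRBuilderBase &B, llvm::Value *Ptr,
                            llvm::Value *Stride) {
      llvm::MatrixBuilder MB(B);
      return MB.CreateColumnMajorLoad(B.getDoubleTy(), Ptr, llvm::Align(8),
                                      Stride, /*IsVolatile=*/false,
                                      /*Rows=*/3, /*Columns=*/3);
    }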
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 79d93d0..30aeccd 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -147,6 +147,7 @@ Changes to the C API
--------------------
* Add `LLVMGetOrInsertFunction` to get or insert a function, replacing the combination of `LLVMGetNamedFunction` and `LLVMAddFunction`.
+* Allow `LLVMGetVolatile` to work with any kind of Instruction.
Changes to the CodeGen infrastructure
-------------------------------------
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 3d22f859..4e380d9 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -4757,7 +4757,7 @@ LLVM_C_ABI LLVMValueRef LLVMBuildGlobalString(LLVMBuilderRef B, const char *Str,
LLVM_C_ABI LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B,
const char *Str,
const char *Name);
-LLVM_C_ABI LLVMBool LLVMGetVolatile(LLVMValueRef MemoryAccessInst);
+LLVM_C_ABI LLVMBool LLVMGetVolatile(LLVMValueRef Inst);
LLVM_C_ABI void LLVMSetVolatile(LLVMValueRef MemoryAccessInst,
LLVMBool IsVolatile);
LLVM_C_ABI LLVMBool LLVMGetWeak(LLVMValueRef CmpXchgInst);
diff --git a/llvm/include/llvm/ADT/Bitfields.h b/llvm/include/llvm/ADT/Bitfields.h
index 4064d71..1af2761 100644
--- a/llvm/include/llvm/ADT/Bitfields.h
+++ b/llvm/include/llvm/ADT/Bitfields.h
@@ -86,89 +86,43 @@
#include <limits> // numeric_limits
#include <type_traits>
+#include "llvm/Support/MathExtras.h"
+
namespace llvm {
namespace bitfields_details {
-/// A struct defining useful bit patterns for n-bits integer types.
-template <typename T, unsigned Bits> struct BitPatterns {
- /// Bit patterns are forged using the equivalent `Unsigned` type because of
- /// undefined operations over signed types (e.g. Bitwise shift operators).
- /// Moreover same size casting from unsigned to signed is well defined but not
- /// the other way around.
- using Unsigned = std::make_unsigned_t<T>;
- static_assert(sizeof(Unsigned) == sizeof(T), "Types must have same size");
-
- static constexpr unsigned TypeBits = sizeof(Unsigned) * CHAR_BIT;
- static_assert(TypeBits >= Bits, "n-bit must fit in T");
-
- /// e.g. with TypeBits == 8 and Bits == 6.
- static constexpr Unsigned AllZeros = Unsigned(0); // 00000000
- static constexpr Unsigned AllOnes = ~Unsigned(0); // 11111111
- static constexpr Unsigned Umin = AllZeros; // 00000000
- static constexpr Unsigned Umax = AllOnes >> (TypeBits - Bits); // 00111111
- static constexpr Unsigned SignBitMask = Unsigned(1) << (Bits - 1); // 00100000
- static constexpr Unsigned Smax = Umax >> 1U; // 00011111
- static constexpr Unsigned Smin = ~Smax; // 11100000
- static constexpr Unsigned SignExtend = Unsigned(Smin << 1U); // 11000000
-};
-
-/// `Compressor` is used to manipulate the bits of a (possibly signed) integer
-/// type so it can be packed and unpacked into a `bits` sized integer,
-/// `Compressor` is specialized on signed-ness so no runtime cost is incurred.
-/// The `pack` method also checks that the passed in `UserValue` is valid.
-template <typename T, unsigned Bits, bool = std::is_unsigned<T>::value>
-struct Compressor {
- static_assert(std::is_unsigned<T>::value, "T must be unsigned");
- using BP = BitPatterns<T, Bits>;
-
- static T pack(T UserValue, T UserMaxValue) {
- assert(UserValue <= UserMaxValue && "value is too big");
- assert(UserValue <= BP::Umax && "value is too big");
- return UserValue;
- }
-
- static T unpack(T StorageValue) { return StorageValue; }
-};
-
-template <typename T, unsigned Bits> struct Compressor<T, Bits, false> {
- static_assert(std::is_signed<T>::value, "T must be signed");
- using BP = BitPatterns<T, Bits>;
-
- static T pack(T UserValue, T UserMaxValue) {
- assert(UserValue <= UserMaxValue && "value is too big");
- assert(UserValue <= T(BP::Smax) && "value is too big");
- assert(UserValue >= T(BP::Smin) && "value is too small");
- if (UserValue < 0)
- UserValue &= ~BP::SignExtend;
- return UserValue;
- }
-
- static T unpack(T StorageValue) {
- if (StorageValue >= T(BP::SignBitMask))
- StorageValue |= BP::SignExtend;
- return StorageValue;
- }
-};
-
/// Impl is where Bitfield description and Storage are put together to interact
/// with values.
template <typename Bitfield, typename StorageType> struct Impl {
static_assert(std::is_unsigned<StorageType>::value,
"Storage must be unsigned");
using IntegerType = typename Bitfield::IntegerType;
- using C = Compressor<IntegerType, Bitfield::Bits>;
- using BP = BitPatterns<StorageType, Bitfield::Bits>;
static constexpr size_t StorageBits = sizeof(StorageType) * CHAR_BIT;
static_assert(Bitfield::FirstBit <= StorageBits, "Data must fit in mask");
static_assert(Bitfield::LastBit <= StorageBits, "Data must fit in mask");
- static constexpr StorageType Mask = BP::Umax << Bitfield::Shift;
+ static constexpr StorageType LowMask =
+ maskTrailingOnes<StorageType>(Bitfield::Bits);
+ static constexpr StorageType Mask = LowMask << Bitfield::Shift;
+
+ /// Validates that `UserValue` fits within the bitfield's range.
+ static void checkValue(IntegerType UserValue, IntegerType UserMaxValue) {
+ assert(UserValue <= UserMaxValue && "value is too big");
+ if constexpr (std::is_unsigned_v<IntegerType>) {
+ assert(isUInt<Bitfield::Bits>(UserValue) && "value is too big");
+ } else {
+ static_assert(std::is_signed_v<IntegerType>,
+ "IntegerType must be signed");
+ assert(isInt<Bitfield::Bits>(UserValue) && "value is out of range");
+ }
+ }
/// Checks `UserValue` is within bounds and packs it between `FirstBit` and
/// `LastBit` of `Packed` leaving the rest unchanged.
static void update(StorageType &Packed, IntegerType UserValue) {
- const StorageType StorageValue = C::pack(UserValue, Bitfield::UserMaxValue);
+ checkValue(UserValue, Bitfield::UserMaxValue);
+ const StorageType StorageValue = UserValue & LowMask;
Packed &= ~Mask;
Packed |= StorageValue << Bitfield::Shift;
}
@@ -177,7 +131,9 @@ template <typename Bitfield, typename StorageType> struct Impl {
/// an `IntegerType`.
static IntegerType extract(StorageType Packed) {
const StorageType StorageValue = (Packed & Mask) >> Bitfield::Shift;
- return C::unpack(StorageValue);
+ if constexpr (std::is_signed_v<IntegerType>)
+ return SignExtend64<Bitfield::Bits>(StorageValue);
+ return StorageValue;
}
/// Interprets bits between `FirstBit` and `LastBit` of `Packed` as
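
For context, a sketch of the public Bitfield API this Impl backs (unchanged by the patch), showing the sign-extension round trip:

    #include "llvm/ADT/Bitfields.h"
    #include <cstdint>

    // A signed 6-bit field at bit offset 0 of a uint32_t.
    using Amount = llvm::Bitfield::Element<int, 0, 6>;

    void roundTrip() {
      uint32_t Storage = 0;
      // set() now asserts isInt<6>(-3) via checkValue(), then masks the
      // value with LowMask before shifting it into place.
      llvm::Bitfield::set<Amount>(Storage, -3);
      // extract() sign-extends via SignExtend64<6>, so V == -3 again.
      int V = llvm::Bitfield::get<Amount>(Storage);
      (void)V;
    }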
diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h
index 0ce7c57a..a96535c 100644
--- a/llvm/include/llvm/ADT/StringSwitch.h
+++ b/llvm/include/llvm/ADT/StringSwitch.h
@@ -17,6 +17,7 @@
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
#include <cstring>
+#include <initializer_list>
#include <optional>
namespace llvm {
@@ -85,55 +86,60 @@ public:
return *this;
}
+ StringSwitch &Cases(std::initializer_list<StringLiteral> CaseStrings,
+ T Value) {
+ return CasesImpl(Value, CaseStrings);
+ }
+
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, T Value) {
- return CasesImpl(Value, S0, S1);
+ return CasesImpl(Value, {S0, S1});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
T Value) {
- return CasesImpl(Value, S0, S1, S2);
+ return CasesImpl(Value, {S0, S1, S2});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3);
+ return CasesImpl(Value, {S0, S1, S2, S3});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4);
+ return CasesImpl(Value, {S0, S1, S2, S3, S4});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5);
+ return CasesImpl(Value, {S0, S1, S2, S3, S4, S5});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
StringLiteral S6, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6);
+ return CasesImpl(Value, {S0, S1, S2, S3, S4, S5, S6});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
StringLiteral S6, StringLiteral S7, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6, S7);
+ return CasesImpl(Value, {S0, S1, S2, S3, S4, S5, S6, S7});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
StringLiteral S6, StringLiteral S7, StringLiteral S8,
T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6, S7, S8);
+ return CasesImpl(Value, {S0, S1, S2, S3, S4, S5, S6, S7, S8});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
StringLiteral S6, StringLiteral S7, StringLiteral S8,
StringLiteral S9, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9);
+ return CasesImpl(Value, {S0, S1, S2, S3, S4, S5, S6, S7, S8, S9});
}
// Case-insensitive case matchers.
@@ -156,23 +162,28 @@ public:
return *this;
}
+ StringSwitch &CasesLower(std::initializer_list<StringLiteral> CaseStrings,
+ T Value) {
+ return CasesLowerImpl(Value, CaseStrings);
+ }
+
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, T Value) {
- return CasesLowerImpl(Value, S0, S1);
+ return CasesLowerImpl(Value, {S0, S1});
}
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2,
T Value) {
- return CasesLowerImpl(Value, S0, S1, S2);
+ return CasesLowerImpl(Value, {S0, S1, S2});
}
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, T Value) {
- return CasesLowerImpl(Value, S0, S1, S2, S3);
+ return CasesLowerImpl(Value, {S0, S1, S2, S3});
}
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, T Value) {
- return CasesLowerImpl(Value, S0, S1, S2, S3, S4);
+ return CasesLowerImpl(Value, {S0, S1, S2, S3, S4});
}
[[nodiscard]] R Default(T Value) {
@@ -211,16 +222,21 @@ private:
return false;
}
- template <typename... Args> StringSwitch &CasesImpl(T &Value, Args... Cases) {
+ StringSwitch &CasesImpl(T &Value,
+ std::initializer_list<StringLiteral> Cases) {
// Stop matching after the string is found.
- (... || CaseImpl(Value, Cases));
+ for (StringLiteral S : Cases)
+ if (CaseImpl(Value, S))
+ break;
return *this;
}
- template <typename... Args>
- StringSwitch &CasesLowerImpl(T &Value, Args... Cases) {
+ StringSwitch &CasesLowerImpl(T &Value,
+ std::initializer_list<StringLiteral> Cases) {
// Stop matching after the string is found.
- (... || CaseLowerImpl(Value, Cases));
+ for (StringLiteral S : Cases)
+ if (CaseLowerImpl(Value, S))
+ break;
return *this;
}
};
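
The new overloads in use, as a sketch (identifiers illustrative, not from the patch): any number of case strings can be grouped in one braced list instead of relying on a fixed-arity overload.

    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/StringSwitch.h"

    // String literals convert implicitly to StringLiteral, so a plain
    // braced list satisfies the initializer_list<StringLiteral> overload.
    int parseLevel(llvm::StringRef S) {
      return llvm::StringSwitch<int>(S)
          .Cases({"err", "error", "fatal"}, 2)
          .Cases({"warn", "warning"}, 1)
          .Case("info", 0)
          .Default(-1);
    }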
diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
index 164b46b..07a482d 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
@@ -182,6 +182,12 @@ m_scev_PtrToInt(const Op0_t &Op0) {
return SCEVUnaryExpr_match<SCEVPtrToIntExpr, Op0_t>(Op0);
}
+template <typename Op0_t>
+inline SCEVUnaryExpr_match<SCEVTruncateExpr, Op0_t>
+m_scev_Trunc(const Op0_t &Op0) {
+ return m_scev_Unary<SCEVTruncateExpr>(Op0);
+}
+
/// Match a binary SCEV.
template <typename SCEVTy, typename Op0_t, typename Op1_t,
SCEV::NoWrapFlags WrapFlags = SCEV::FlagAnyWrap,
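
The intended idiom for the new matcher, mirroring the ScalarEvolution.cpp change later in this patch:

    #include "llvm/Analysis/ScalarEvolutionPatternMatch.h"

    using namespace llvm::SCEVPatternMatch;

    // Match zext(trunc(X)) in one pattern; X binds to the operand that was
    // truncated and re-extended.
    static bool isZExtOfTrunc(const llvm::SCEV *Expr, const llvm::SCEV *&X) {
      return match(Expr, m_scev_ZExt(m_scev_Trunc(m_SCEV(X))));
    }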
diff --git a/llvm/include/llvm/Analysis/StaticDataProfileInfo.h b/llvm/include/llvm/Analysis/StaticDataProfileInfo.h
index fa21eba..f06e7ce 100644
--- a/llvm/include/llvm/Analysis/StaticDataProfileInfo.h
+++ b/llvm/include/llvm/Analysis/StaticDataProfileInfo.h
@@ -10,6 +10,24 @@
namespace llvm {
+namespace memprof {
+// Represents the eligibility status of a global variable for section prefix
+// annotation. Other than AnnotationOK, each enum value indicates a specific
+// reason for ineligibility.
+enum class AnnotationKind : uint8_t {
+ AnnotationOK,
+ DeclForLinker,
+ ExplicitSection,
+ ReservedName,
+};
+/// Returns the annotation kind of the global variable \p GV.
+AnnotationKind getAnnotationKind(const GlobalVariable &GV);
+
+/// Returns true if the annotation kind of the global variable \p GV is
+/// AnnotationOK.
+bool IsAnnotationOK(const GlobalVariable &GV);
+} // namespace memprof
+
/// A class that holds the constants that represent static data and their
/// profile information and provides methods to operate on them.
class StaticDataProfileInfo {
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index c76c83d..ff3dd0d 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -514,6 +514,12 @@ enum NodeType {
/// separately rounded operations.
FMAD,
+  /// FMULADD - Performs a * b + c, with or without intermediate rounding.
+ /// It is expected that this will be illegal for most targets, as it usually
+ /// makes sense to split this or use an FMA. But some targets, such as
+ /// WebAssembly, can directly support these semantics.
+ FMULADD,
+
/// FCOPYSIGN(X, Y) - Return the value of X with the sign of Y. NOTE: This
/// DAG node does not require that X and Y have the same type, just that
/// they are both floating point. X and the result must have the same type.
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 62c0806..df6ce0f 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1850,9 +1850,11 @@ public:
/// Get the specified node if it's already available, or else return NULL.
LLVM_ABI SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList,
ArrayRef<SDValue> Ops,
- const SDNodeFlags Flags);
+ const SDNodeFlags Flags,
+ bool AllowCommute = false);
LLVM_ABI SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList,
- ArrayRef<SDValue> Ops);
+ ArrayRef<SDValue> Ops,
+ bool AllowCommute = false);
/// Check if a node exists without modifying its flags.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList,
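
A hypothetical call site for the new parameter: with AllowCommute set, a cached node built with swapped operands can satisfy the query for commutative opcodes.

    #include "llvm/CodeGen/SelectionDAG.h"

    // Hypothetical helper: look up an existing (A + B) node, also accepting
    // one that was built as (B + A), since ISD::ADD is commutative.
    llvm::SDNode *findExistingAdd(llvm::SelectionDAG &DAG, llvm::SDValue A,
                                  llvm::SDValue B, llvm::EVT VT) {
      return DAG.getNodeIfExists(llvm::ISD::ADD, DAG.getVTList(VT), {A, B},
                                 /*AllowCommute=*/true);
    }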
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 1694a33..46b3d53 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -472,7 +472,7 @@ __OMP_RTL(__kmpc_target_init, false, Int32, KernelEnvironmentPtr, KernelLaunchEn
__OMP_RTL(__kmpc_target_deinit, false, Void,)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
- FuncPtrTy, VoidPtr, VoidPtrPtr, SizeTy)
+ FuncPtrTy, FuncPtrTy, VoidPtrPtr, SizeTy)
__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int8)
diff --git a/llvm/include/llvm/IR/ConstantFPRange.h b/llvm/include/llvm/IR/ConstantFPRange.h
index 39dc7c1..e772095 100644
--- a/llvm/include/llvm/IR/ConstantFPRange.h
+++ b/llvm/include/llvm/IR/ConstantFPRange.h
@@ -230,6 +230,19 @@ public:
/// Return a new range representing the possible values resulting
/// from a subtraction of a value in this range and a value in \p Other.
LLVM_ABI ConstantFPRange sub(const ConstantFPRange &Other) const;
+
+ /// Return a new range representing the possible values resulting
+ /// from a multiplication of a value in this range and a value in \p Other.
+ LLVM_ABI ConstantFPRange mul(const ConstantFPRange &Other) const;
+
+ /// Return a new range representing the possible values resulting
+ /// from a division of a value in this range and a value in
+ /// \p Other.
+ LLVM_ABI ConstantFPRange div(const ConstantFPRange &Other) const;
+
+ /// Flush denormal values to zero according to the specified mode.
+ /// For dynamic mode, we return the union of all possible results.
+ LLVM_ABI void flushDenormals(DenormalMode::DenormalModeKind Mode);
};
inline raw_ostream &operator<<(raw_ostream &OS, const ConstantFPRange &CR) {
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 041a4ce..dacda0a 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2548,6 +2548,11 @@ public:
std::optional<RoundingMode> Rounding = std::nullopt,
std::optional<fp::ExceptionBehavior> Except = std::nullopt);
+ LLVM_ABI Value *CreateSelectWithUnknownProfile(Value *C, Value *True,
+ Value *False,
+ StringRef PassName,
+ const Twine &Name = "");
+
LLVM_ABI Value *CreateSelect(Value *C, Value *True, Value *False,
const Twine &Name = "",
Instruction *MDFrom = nullptr);
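
A hypothetical use of the new entry point, assuming it mirrors CreateSelect but marks the select's branch weights as unknown, attributed to the named pass:

    #include "llvm/IR/IRBuilder.h"

    // "my-pass" is an illustrative pass name, not one from this patch.
    llvm::Value *emitSelect(llvm::IRBuilderBase &B, llvm::Value *Cond,
                            llvm::Value *T, llvm::Value *F) {
      return B.CreateSelectWithUnknownProfile(Cond, T, F,
                                              /*PassName=*/"my-pass");
    }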
diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h
index 48fc600..39a08d4 100644
--- a/llvm/include/llvm/Support/DebugCounter.h
+++ b/llvm/include/llvm/Support/DebugCounter.h
@@ -178,6 +178,7 @@ protected:
std::string Desc;
SmallVector<Chunk> Chunks;
};
+ bool handleCounterIncrement(CounterInfo &Info);
DenseMap<unsigned, CounterInfo> Counters;
CounterVector RegisteredCounters;
@@ -188,6 +189,8 @@ protected:
bool ShouldPrintCounter = false;
+ bool ShouldPrintCounterQueries = false;
+
bool BreakOnLast = false;
};
diff --git a/llvm/include/llvm/Support/Format.h b/llvm/include/llvm/Support/Format.h
index 34b224d..b549341 100644
--- a/llvm/include/llvm/Support/Format.h
+++ b/llvm/include/llvm/Support/Format.h
@@ -78,9 +78,20 @@ public:
/// printed, this synthesizes the string into a temporary buffer provided and
/// returns whether or not it is big enough.
+namespace detail {
+template <typename T> struct decay_if_c_char_array {
+ using type = T;
+};
+template <std::size_t N> struct decay_if_c_char_array<char[N]> {
+ using type = const char *;
+};
+template <typename T>
+using decay_if_c_char_array_t = typename decay_if_c_char_array<T>::type;
+} // namespace detail
+
template <typename... Ts>
class format_object final : public format_object_base {
- std::tuple<Ts...> Vals;
+ std::tuple<detail::decay_if_c_char_array_t<Ts>...> Vals;
template <std::size_t... Is>
int snprint_tuple(char *Buffer, unsigned BufferSize,
@@ -96,7 +107,7 @@ public:
format_object(const char *fmt, const Ts &... vals)
: format_object_base(fmt), Vals(vals...) {
static_assert(
- (std::is_scalar_v<Ts> && ...),
+ (std::is_scalar_v<detail::decay_if_c_char_array_t<Ts>> && ...),
"format can't be used with non fundamental / non pointer type");
}
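
The practical effect, sketched: a string literal is a char[N], which failed the old is_scalar check; with the decay helper it is stored as const char * and formats normally.

    #include "llvm/Support/Format.h"
    #include "llvm/Support/raw_ostream.h"

    void demo() {
      // "insertions" is a char[11]; it now decays to const char * inside
      // format_object instead of tripping the static_assert.
      llvm::outs() << llvm::format("%-12s %6d\n", "insertions", 6916);
    }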
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 632be7a..07a858f 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -535,6 +535,7 @@ def fdiv : SDNode<"ISD::FDIV" , SDTFPBinOp>;
def frem : SDNode<"ISD::FREM" , SDTFPBinOp>;
def fma : SDNode<"ISD::FMA" , SDTFPTernaryOp, [SDNPCommutative]>;
def fmad : SDNode<"ISD::FMAD" , SDTFPTernaryOp, [SDNPCommutative]>;
+def fmuladd : SDNode<"ISD::FMULADD" , SDTFPTernaryOp, [SDNPCommutative]>;
def fabs : SDNode<"ISD::FABS" , SDTFPUnaryOp>;
def fminnum : SDNode<"ISD::FMINNUM" , SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
index b1fca55..2ac58a5 100644
--- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h
+++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
@@ -161,6 +161,8 @@ inline static bool isAltFmt(unsigned VType) { return VType & 0x100; }
LLVM_ABI void printVType(unsigned VType, raw_ostream &OS);
+LLVM_ABI void printXSfmmVType(unsigned VType, raw_ostream &OS);
+
LLVM_ABI unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul);
LLVM_ABI std::optional<VLMUL> getSameRatioLMUL(unsigned SEW, VLMUL VLMUL,
diff --git a/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h b/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h
index 558984f..eb2b34d 100644
--- a/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h
+++ b/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h
@@ -12,9 +12,7 @@
#ifndef LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
#define LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
-namespace llvm {
-
-namespace coro {
+namespace llvm::coro {
// True if I is trivially rematerializable, e.g. InsertElementInst
LLVM_ABI bool isTriviallyMaterializable(Instruction &I);
@@ -24,8 +22,6 @@ LLVM_ABI void
doRematerializations(Function &F, SuspendCrossingInfo &Checker,
std::function<bool(Instruction &)> IsMaterializable);
-} // namespace coro
-
-} // namespace llvm
+} // namespace llvm::coro
#endif // LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
diff --git a/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h b/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h
index 6cdf83c0..356f9ca 100644
--- a/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h
+++ b/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h
@@ -13,9 +13,7 @@
#ifndef LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H
#define LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H
-namespace llvm {
-
-namespace coro {
+namespace llvm::coro {
using SpillInfo = SmallMapVector<Value *, SmallVector<Instruction *, 2>, 8>;
@@ -38,6 +36,7 @@ void collectSpillsAndAllocasFromInsts(
SmallVector<CoroAllocaAllocInst *, 4> &LocalAllocas, Function &F,
const SuspendCrossingInfo &Checker, const DominatorTree &DT,
const coro::Shape &Shape);
+
void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
const SuspendCrossingInfo &Checker);
@@ -52,8 +51,6 @@ void sinkSpillUsesAfterCoroBegin(const DominatorTree &DT,
BasicBlock::iterator getSpillInsertionPt(const coro::Shape &, Value *Def,
const DominatorTree &DT);
-} // namespace coro
-
-} // namespace llvm
+} // namespace llvm::coro
#endif // LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H
diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
index 48e8c86..2db3f6d4 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
@@ -13,7 +13,6 @@
#ifndef LLVM_TRANSFORMS_UTILS_SSAUPDATERBULK_H
#define LLVM_TRANSFORMS_UTILS_SSAUPDATERBULK_H
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/PredIteratorCache.h"
#include "llvm/Support/Compiler.h"
@@ -79,6 +78,10 @@ public:
LLVM_ABI void
RewriteAllUses(DominatorTree *DT,
SmallVectorImpl<PHINode *> *InsertedPHIs = nullptr);
+
+ /// Rewrite all uses and simplify the inserted PHI nodes.
+ /// Use this method to preserve behavior when replacing SSAUpdater.
+ void RewriteAndOptimizeAllUses(DominatorTree &DT);
};
} // end namespace llvm
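
A sketch of the migration pattern the comment describes (identifiers illustrative, not from the patch):

    #include "llvm/IR/Dominators.h"
    #include "llvm/Transforms/Utils/SSAUpdaterBulk.h"

    // Batch the rewrites, then let the new entry point also simplify the
    // PHIs it inserted, matching SSAUpdater's observable behavior.
    void rewriteVar(llvm::DominatorTree &DT, llvm::Type *Ty,
                    llvm::BasicBlock *DefBB, llvm::Value *DefV,
                    llvm::Use *U) {
      llvm::SSAUpdaterBulk Updater;
      unsigned Var = Updater.AddVariable("x", Ty);
      Updater.AddAvailableValue(Var, DefBB, DefV);
      Updater.AddUse(Var, U);
      Updater.RewriteAndOptimizeAllUses(DT);
    }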
diff --git a/llvm/include/llvm/XRay/BlockIndexer.h b/llvm/include/llvm/XRay/BlockIndexer.h
index e9782da..155e6bd 100644
--- a/llvm/include/llvm/XRay/BlockIndexer.h
+++ b/llvm/include/llvm/XRay/BlockIndexer.h
@@ -19,8 +19,7 @@
#include <cstdint>
#include <vector>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
// The BlockIndexer will gather all related records associated with a
// process+thread and group them by 'Block'.
@@ -63,7 +62,6 @@ public:
Error flush();
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_BLOCKINDEXER_H
diff --git a/llvm/include/llvm/XRay/BlockPrinter.h b/llvm/include/llvm/XRay/BlockPrinter.h
index caf78c5..81944a5 100644
--- a/llvm/include/llvm/XRay/BlockPrinter.h
+++ b/llvm/include/llvm/XRay/BlockPrinter.h
@@ -18,8 +18,7 @@
#include "llvm/XRay/FDRRecords.h"
#include "llvm/XRay/RecordPrinter.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class LLVM_ABI BlockPrinter : public RecordVisitor {
enum class State {
@@ -55,7 +54,6 @@ public:
void reset() { CurrentState = State::Start; }
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_BLOCKPRINTER_H
diff --git a/llvm/include/llvm/XRay/BlockVerifier.h b/llvm/include/llvm/XRay/BlockVerifier.h
index b88785c..5e7b25c 100644
--- a/llvm/include/llvm/XRay/BlockVerifier.h
+++ b/llvm/include/llvm/XRay/BlockVerifier.h
@@ -16,8 +16,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class LLVM_ABI BlockVerifier : public RecordVisitor {
public:
@@ -64,7 +63,6 @@ public:
void reset();
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_BLOCKVERIFIER_H
diff --git a/llvm/include/llvm/XRay/FDRLogBuilder.h b/llvm/include/llvm/XRay/FDRLogBuilder.h
index f07c446..5f7b815 100644
--- a/llvm/include/llvm/XRay/FDRLogBuilder.h
+++ b/llvm/include/llvm/XRay/FDRLogBuilder.h
@@ -10,8 +10,7 @@
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// The LogBuilder class allows for creating ad-hoc collections of records
/// through the `add<...>(...)` function. An example use of this API is in
@@ -34,7 +33,6 @@ public:
std::vector<std::unique_ptr<Record>> consume() { return std::move(Records); }
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRLOGBUILDER_H
diff --git a/llvm/include/llvm/XRay/FDRRecordConsumer.h b/llvm/include/llvm/XRay/FDRRecordConsumer.h
index 473777f..13bb711 100644
--- a/llvm/include/llvm/XRay/FDRRecordConsumer.h
+++ b/llvm/include/llvm/XRay/FDRRecordConsumer.h
@@ -15,8 +15,7 @@
#include <memory>
#include <vector>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class RecordConsumer {
public:
@@ -48,7 +47,6 @@ public:
Error consume(std::unique_ptr<Record> R) override;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRRECORDCONSUMER_H
diff --git a/llvm/include/llvm/XRay/FDRRecordProducer.h b/llvm/include/llvm/XRay/FDRRecordProducer.h
index 083b571..b953f62 100644
--- a/llvm/include/llvm/XRay/FDRRecordProducer.h
+++ b/llvm/include/llvm/XRay/FDRRecordProducer.h
@@ -14,8 +14,7 @@
#include "llvm/XRay/XRayRecord.h"
#include <memory>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class RecordProducer {
public:
@@ -45,7 +44,6 @@ public:
Expected<std::unique_ptr<Record>> produce() override;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRRECORDPRODUCER_H
diff --git a/llvm/include/llvm/XRay/FDRRecords.h b/llvm/include/llvm/XRay/FDRRecords.h
index 7ee8db6..91689cae 100644
--- a/llvm/include/llvm/XRay/FDRRecords.h
+++ b/llvm/include/llvm/XRay/FDRRecords.h
@@ -23,8 +23,7 @@
#include "llvm/Support/Error.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class RecordVisitor;
class RecordInitializer;
@@ -444,7 +443,6 @@ public:
Error visit(TypedEventRecord &) override;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRRECORDS_H
diff --git a/llvm/include/llvm/XRay/FDRTraceExpander.h b/llvm/include/llvm/XRay/FDRTraceExpander.h
index 197c123..ca400c9 100644
--- a/llvm/include/llvm/XRay/FDRTraceExpander.h
+++ b/llvm/include/llvm/XRay/FDRTraceExpander.h
@@ -17,8 +17,7 @@
#include "llvm/XRay/FDRRecords.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class TraceExpander : public RecordVisitor {
// Type-erased callback for handling individual XRayRecord instances.
@@ -56,7 +55,6 @@ public:
Error flush();
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRTRACEEXPANDER_H
diff --git a/llvm/include/llvm/XRay/FDRTraceWriter.h b/llvm/include/llvm/XRay/FDRTraceWriter.h
index a3dc58e..957039d 100644
--- a/llvm/include/llvm/XRay/FDRTraceWriter.h
+++ b/llvm/include/llvm/XRay/FDRTraceWriter.h
@@ -18,8 +18,7 @@
#include "llvm/XRay/FDRRecords.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// The FDRTraceWriter allows us to hand-craft an XRay Flight Data Recorder
/// (FDR) mode log file. This is used primarily for testing, generating
@@ -50,7 +49,6 @@ private:
support::endian::Writer OS;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRTRACEWRITER_H
diff --git a/llvm/include/llvm/XRay/FileHeaderReader.h b/llvm/include/llvm/XRay/FileHeaderReader.h
index ecdb975..758ca29 100644
--- a/llvm/include/llvm/XRay/FileHeaderReader.h
+++ b/llvm/include/llvm/XRay/FileHeaderReader.h
@@ -19,15 +19,13 @@
#include "llvm/XRay/XRayRecord.h"
#include <cstdint>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// Convenience function for loading the file header given a data extractor at a
/// specified offset.
LLVM_ABI Expected<XRayFileHeader>
readBinaryFormatHeader(DataExtractor &HeaderExtractor, uint64_t &OffsetPtr);
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FILEHEADERREADER_H
diff --git a/llvm/include/llvm/XRay/Graph.h b/llvm/include/llvm/XRay/Graph.h
index 07b418b..8521e09 100644
--- a/llvm/include/llvm/XRay/Graph.h
+++ b/llvm/include/llvm/XRay/Graph.h
@@ -23,8 +23,7 @@
#include "llvm/ADT/iterator.h"
#include "llvm/Support/Error.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// A Graph object represents a Directed Graph and is used in XRay to compute
/// and store function call graphs and associated statistical information.
@@ -485,6 +484,6 @@ public:
return p;
}
};
-}
-}
+} // namespace llvm::xray
+
#endif
diff --git a/llvm/include/llvm/XRay/InstrumentationMap.h b/llvm/include/llvm/XRay/InstrumentationMap.h
index b5371478..c5e7ebf 100644
--- a/llvm/include/llvm/XRay/InstrumentationMap.h
+++ b/llvm/include/llvm/XRay/InstrumentationMap.h
@@ -23,9 +23,7 @@
#include <unordered_map>
#include <vector>
-namespace llvm {
-
-namespace xray {
+namespace llvm::xray {
// Forward declare to make a friend.
class InstrumentationMap;
@@ -102,11 +100,11 @@ public:
const SledContainer &sleds() const { return Sleds; };
};
-} // end namespace xray
-
-namespace yaml {
+} // end namespace llvm::xray
-template <> struct ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
+namespace llvm {
+template <>
+struct yaml::ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
static void enumeration(IO &IO, xray::SledEntry::FunctionKinds &Kind) {
IO.enumCase(Kind, "function-enter", xray::SledEntry::FunctionKinds::ENTRY);
IO.enumCase(Kind, "function-exit", xray::SledEntry::FunctionKinds::EXIT);
@@ -118,7 +116,7 @@ template <> struct ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
}
};
-template <> struct MappingTraits<xray::YAMLXRaySledEntry> {
+template <> struct yaml::MappingTraits<xray::YAMLXRaySledEntry> {
static void mapping(IO &IO, xray::YAMLXRaySledEntry &Entry) {
IO.mapRequired("id", Entry.FuncId);
IO.mapRequired("address", Entry.Address);
@@ -131,10 +129,7 @@ template <> struct MappingTraits<xray::YAMLXRaySledEntry> {
static constexpr bool flow = true;
};
-
-} // end namespace yaml
-
-} // end namespace llvm
+} // namespace llvm
LLVM_YAML_IS_SEQUENCE_VECTOR(xray::YAMLXRaySledEntry)
diff --git a/llvm/include/llvm/XRay/Profile.h b/llvm/include/llvm/XRay/Profile.h
index e30c01e..b5b8dd2 100644
--- a/llvm/include/llvm/XRay/Profile.h
+++ b/llvm/include/llvm/XRay/Profile.h
@@ -22,8 +22,7 @@
#include <utility>
#include <vector>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class Profile;
@@ -144,7 +143,6 @@ public:
bool empty() const { return Blocks.empty(); }
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif
diff --git a/llvm/include/llvm/XRay/RecordPrinter.h b/llvm/include/llvm/XRay/RecordPrinter.h
index 5d2c277..3281221 100644
--- a/llvm/include/llvm/XRay/RecordPrinter.h
+++ b/llvm/include/llvm/XRay/RecordPrinter.h
@@ -17,8 +17,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class LLVM_ABI RecordPrinter : public RecordVisitor {
raw_ostream &OS;
@@ -44,7 +43,6 @@ public:
Error visit(TypedEventRecord &) override;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_RECORDPRINTER_H
diff --git a/llvm/include/llvm/XRay/Trace.h b/llvm/include/llvm/XRay/Trace.h
index 5e4e40a..13ada22 100644
--- a/llvm/include/llvm/XRay/Trace.h
+++ b/llvm/include/llvm/XRay/Trace.h
@@ -21,8 +21,7 @@
#include "llvm/Support/Error.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// A Trace object represents the records that have been loaded from XRay
/// log files generated by instrumented binaries. We encapsulate the logic of
@@ -76,7 +75,6 @@ LLVM_ABI Expected<Trace> loadTraceFile(StringRef Filename, bool Sort = false);
LLVM_ABI Expected<Trace> loadTrace(const DataExtractor &Extractor,
bool Sort = false);
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_TRACE_H
diff --git a/llvm/include/llvm/XRay/XRayRecord.h b/llvm/include/llvm/XRay/XRayRecord.h
index 238bf3d..8f3440c 100644
--- a/llvm/include/llvm/XRay/XRayRecord.h
+++ b/llvm/include/llvm/XRay/XRayRecord.h
@@ -18,8 +18,7 @@
#include <vector>
#include <string>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// XRay traces all have a header providing some top-matter information useful
/// to help tools determine how to interpret the information available in the
@@ -98,7 +97,6 @@ struct XRayRecord {
std::string Data;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_XRAYRECORD_H
diff --git a/llvm/include/llvm/XRay/YAMLXRayRecord.h b/llvm/include/llvm/XRay/YAMLXRayRecord.h
index 6062606..6bf4f1d 100644
--- a/llvm/include/llvm/XRay/YAMLXRayRecord.h
+++ b/llvm/include/llvm/XRay/YAMLXRayRecord.h
@@ -17,8 +17,7 @@
#include "llvm/Support/YAMLTraits.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
struct YAMLXRayFileHeader {
uint16_t Version;
@@ -46,13 +45,12 @@ struct YAMLXRayTrace {
std::vector<YAMLXRayRecord> Records;
};
-} // namespace xray
-
-namespace yaml {
+} // namespace llvm::xray
+namespace llvm {
// YAML Traits
// -----------
-template <> struct ScalarEnumerationTraits<xray::RecordTypes> {
+template <> struct yaml::ScalarEnumerationTraits<xray::RecordTypes> {
static void enumeration(IO &IO, xray::RecordTypes &Type) {
IO.enumCase(Type, "function-enter", xray::RecordTypes::ENTER);
IO.enumCase(Type, "function-exit", xray::RecordTypes::EXIT);
@@ -63,7 +61,7 @@ template <> struct ScalarEnumerationTraits<xray::RecordTypes> {
}
};
-template <> struct MappingTraits<xray::YAMLXRayFileHeader> {
+template <> struct yaml::MappingTraits<xray::YAMLXRayFileHeader> {
static void mapping(IO &IO, xray::YAMLXRayFileHeader &Header) {
IO.mapRequired("version", Header.Version);
IO.mapRequired("type", Header.Type);
@@ -73,7 +71,7 @@ template <> struct MappingTraits<xray::YAMLXRayFileHeader> {
}
};
-template <> struct MappingTraits<xray::YAMLXRayRecord> {
+template <> struct yaml::MappingTraits<xray::YAMLXRayRecord> {
static void mapping(IO &IO, xray::YAMLXRayRecord &Record) {
IO.mapRequired("type", Record.RecordType);
IO.mapOptional("func-id", Record.FuncId);
@@ -90,7 +88,7 @@ template <> struct MappingTraits<xray::YAMLXRayRecord> {
static constexpr bool flow = true;
};
-template <> struct MappingTraits<xray::YAMLXRayTrace> {
+template <> struct yaml::MappingTraits<llvm::xray::YAMLXRayTrace> {
static void mapping(IO &IO, xray::YAMLXRayTrace &Trace) {
// A trace file contains two parts, the header and the list of all the
// trace records.
@@ -98,8 +96,6 @@ template <> struct MappingTraits<xray::YAMLXRayTrace> {
IO.mapRequired("records", Trace.Records);
}
};
-
-} // namespace yaml
} // namespace llvm
LLVM_YAML_IS_SEQUENCE_VECTOR(xray::YAMLXRayRecord)
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index b5b4cd9..00c3dbb 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -5419,20 +5419,15 @@ static Type *isSimpleCastedPHI(const SCEV *Op, const SCEVUnknown *SymbolicPHI,
if (SourceBits != NewBits)
return nullptr;
- const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(Op);
- const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(Op);
- if (!SExt && !ZExt)
- return nullptr;
- const SCEVTruncateExpr *Trunc =
- SExt ? dyn_cast<SCEVTruncateExpr>(SExt->getOperand())
- : dyn_cast<SCEVTruncateExpr>(ZExt->getOperand());
- if (!Trunc)
- return nullptr;
- const SCEV *X = Trunc->getOperand();
- if (X != SymbolicPHI)
- return nullptr;
- Signed = SExt != nullptr;
- return Trunc->getType();
+ if (match(Op, m_scev_SExt(m_scev_Trunc(m_scev_Specific(SymbolicPHI))))) {
+ Signed = true;
+ return cast<SCEVCastExpr>(Op)->getOperand()->getType();
+ }
+ if (match(Op, m_scev_ZExt(m_scev_Trunc(m_scev_Specific(SymbolicPHI))))) {
+ Signed = false;
+ return cast<SCEVCastExpr>(Op)->getOperand()->getType();
+ }
+ return nullptr;
}
static const Loop *isIntegerLoopHeaderPHI(const PHINode *PN, LoopInfo &LI) {
@@ -15428,20 +15423,18 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,
// Try to match 'zext (trunc A to iB) to iY', which is used
// for URem with constant power-of-2 second operands. Make sure the size of
// the operand A matches the size of the whole expressions.
- if (const auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(Expr))
- if (const auto *Trunc = dyn_cast<SCEVTruncateExpr>(ZExt->getOperand(0))) {
- LHS = Trunc->getOperand();
- // Bail out if the type of the LHS is larger than the type of the
- // expression for now.
- if (getTypeSizeInBits(LHS->getType()) >
- getTypeSizeInBits(Expr->getType()))
- return false;
- if (LHS->getType() != Expr->getType())
- LHS = getZeroExtendExpr(LHS, Expr->getType());
- RHS = getConstant(APInt(getTypeSizeInBits(Expr->getType()), 1)
- << getTypeSizeInBits(Trunc->getType()));
- return true;
- }
+ if (match(Expr, m_scev_ZExt(m_scev_Trunc(m_SCEV(LHS))))) {
+ Type *TruncTy = cast<SCEVZeroExtendExpr>(Expr)->getOperand()->getType();
+ // Bail out if the type of the LHS is larger than the type of the
+ // expression for now.
+ if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(Expr->getType()))
+ return false;
+ if (LHS->getType() != Expr->getType())
+ LHS = getZeroExtendExpr(LHS, Expr->getType());
+ RHS = getConstant(APInt(getTypeSizeInBits(Expr->getType()), 1)
+ << getTypeSizeInBits(TruncTy));
+ return true;
+ }
const auto *Add = dyn_cast<SCEVAddExpr>(Expr);
if (Add == nullptr || Add->getNumOperands() != 2)
return false;
diff --git a/llvm/lib/Analysis/StaticDataProfileInfo.cpp b/llvm/lib/Analysis/StaticDataProfileInfo.cpp
index b036b2d..1f751ee 100644
--- a/llvm/lib/Analysis/StaticDataProfileInfo.cpp
+++ b/llvm/lib/Analysis/StaticDataProfileInfo.cpp
@@ -6,6 +6,46 @@
#include "llvm/ProfileData/InstrProf.h"
using namespace llvm;
+
+namespace llvm {
+namespace memprof {
+// Returns true iff the global variable has custom section either by
+// __attribute__((section("name")))
+// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate)
+// or #pragma clang section directives
+// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section).
+static bool hasExplicitSectionName(const GlobalVariable &GVar) {
+ if (GVar.hasSection())
+ return true;
+
+ auto Attrs = GVar.getAttributes();
+ if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") ||
+ Attrs.hasAttribute("relro-section") ||
+ Attrs.hasAttribute("rodata-section"))
+ return true;
+ return false;
+}
+
+AnnotationKind getAnnotationKind(const GlobalVariable &GV) {
+ if (GV.isDeclarationForLinker())
+ return AnnotationKind::DeclForLinker;
+ // Skip 'llvm.'-prefixed global variables conservatively because they are
+  // often handled specially.
+ StringRef Name = GV.getName();
+ if (Name.starts_with("llvm."))
+ return AnnotationKind::ReservedName;
+ // Respect user-specified custom data sections.
+ if (hasExplicitSectionName(GV))
+ return AnnotationKind::ExplicitSection;
+ return AnnotationKind::AnnotationOK;
+}
+
+bool IsAnnotationOK(const GlobalVariable &GV) {
+ return getAnnotationKind(GV) == AnnotationKind::AnnotationOK;
+}
+} // namespace memprof
+} // namespace llvm
+
void StaticDataProfileInfo::addConstantProfileCount(
const Constant *C, std::optional<uint64_t> Count) {
if (!Count) {
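
A hedged sketch of how a caller might consult the new predicate (the real annotation sites live elsewhere in this patch, e.g. StaticDataAnnotator.cpp):

    #include "llvm/Analysis/StaticDataProfileInfo.h"
    #include "llvm/IR/GlobalVariable.h"

    // Skip declarations, "llvm."-reserved names, and globals with
    // user-specified sections before annotating.
    void maybeAnnotate(llvm::GlobalVariable &GV) {
      if (!llvm::memprof::IsAnnotationOK(GV))
        return;
      GV.setSectionPrefix("hot"); // hypothetical prefix choice
    }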
diff --git a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
index 6356d71..873ac8f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
@@ -20,7 +20,7 @@
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
-namespace llvm {
+using namespace llvm;
AIXException::AIXException(AsmPrinter *A) : EHStreamer(A) {}
@@ -90,5 +90,3 @@ void AIXException::endFunction(const MachineFunction *MF) {
emitExceptionInfoTable(LSDALabel, PerSym);
}
-
-} // End of namespace llvm
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index 260ce8f..93ae548 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -85,8 +85,7 @@ template <> struct llvm::DenseMapInfo<VariableID> {
using VarLocInsertPt = PointerUnion<const Instruction *, const DbgRecord *>;
-namespace std {
-template <> struct hash<VarLocInsertPt> {
+template <> struct std::hash<VarLocInsertPt> {
using argument_type = VarLocInsertPt;
using result_type = std::size_t;
@@ -94,7 +93,6 @@ template <> struct hash<VarLocInsertPt> {
return std::hash<void *>()(Arg.getOpaqueValue());
}
};
-} // namespace std
/// Helper class to build FunctionVarLocs, since that class isn't easy to
/// modify. TODO: There's not a great deal of value in the split, it could be
diff --git a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
index fd7df6b..47b7a88 100644
--- a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
+++ b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
@@ -207,9 +207,7 @@ bool ApplyCloning(MachineFunction &MF,
}
return AnyPathsCloned;
}
-} // end anonymous namespace
-namespace llvm {
class BasicBlockPathCloning : public MachineFunctionPass {
public:
static char ID;
@@ -229,7 +227,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
};
-} // namespace llvm
+} // namespace
char BasicBlockPathCloning::ID = 0;
INITIALIZE_PASS_BEGIN(
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp
index 28e6728..1846880 100644
--- a/llvm/lib/CodeGen/BreakFalseDeps.cpp
+++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -31,7 +31,7 @@
using namespace llvm;
-namespace llvm {
+namespace {
class BreakFalseDeps : public MachineFunctionPass {
private:
@@ -95,7 +95,7 @@ private:
void processUndefReads(MachineBasicBlock *);
};
-} // namespace llvm
+} // namespace
#define DEBUG_TYPE "break-false-deps"
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 6c2a5a7..87ada87 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -126,8 +126,7 @@ hash_code hash_value(const ComplexValue &Arg) {
} // end namespace
typedef SmallVector<struct ComplexValue, 2> ComplexValues;
-namespace llvm {
-template <> struct DenseMapInfo<ComplexValue> {
+template <> struct llvm::DenseMapInfo<ComplexValue> {
static inline ComplexValue getEmptyKey() {
return {DenseMapInfo<Value *>::getEmptyKey(),
DenseMapInfo<Value *>::getEmptyKey()};
@@ -144,7 +143,6 @@ template <> struct DenseMapInfo<ComplexValue> {
return LHS.Real == RHS.Real && LHS.Imag == RHS.Imag;
}
};
-} // end namespace llvm
namespace {
template <typename T, typename IterT>
diff --git a/llvm/lib/CodeGen/EdgeBundles.cpp b/llvm/lib/CodeGen/EdgeBundles.cpp
index f4335396..50dd66f 100644
--- a/llvm/lib/CodeGen/EdgeBundles.cpp
+++ b/llvm/lib/CodeGen/EdgeBundles.cpp
@@ -81,13 +81,10 @@ void EdgeBundles::init() {
}
}
-namespace llvm {
-
/// Specialize WriteGraph, the standard implementation won't work.
-template<>
-raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
- bool ShortNames,
- const Twine &Title) {
+template <>
+raw_ostream &llvm::WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
+ bool ShortNames, const Twine &Title) {
const MachineFunction *MF = G.getMachineFunction();
O << "digraph {\n";
@@ -107,8 +104,6 @@ raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
return O;
}
-} // end namespace llvm
-
/// view - Visualize the annotated bipartite CFG with Graphviz.
void EdgeBundles::view() const {
ViewGraph(*this, "EdgeBundles");
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index c500357..04c7008 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -1036,6 +1036,7 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
continue;
addToWorklist(I, Worklist);
+ Modified = true;
break;
}
default:
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index 90c60d4..3812823 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -1975,6 +1975,44 @@ unsigned GISelValueTracking::computeNumSignBits(Register R,
break;
}
+ case TargetOpcode::G_SUB: {
+ Register Src2 = MI.getOperand(2).getReg();
+ unsigned Src2NumSignBits =
+ computeNumSignBits(Src2, DemandedElts, Depth + 1);
+ if (Src2NumSignBits == 1)
+ return 1; // Early out.
+
+ // Handle NEG.
+ Register Src1 = MI.getOperand(1).getReg();
+ KnownBits Known1 = getKnownBits(Src1, DemandedElts, Depth);
+ if (Known1.isZero()) {
+ KnownBits Known2 = getKnownBits(Src2, DemandedElts, Depth);
+ // If the input is known to be 0 or 1, the output is 0/-1, which is all
+ // sign bits set.
+ if ((Known2.Zero | 1).isAllOnes())
+ return TyBits;
+
+ // If the input is known to be positive (the sign bit is known clear),
+ // the output of the NEG has, at worst, the same number of sign bits as
+ // the input.
+ if (Known2.isNonNegative()) {
+ FirstAnswer = Src2NumSignBits;
+ break;
+ }
+
+ // Otherwise, we treat this like a SUB.
+ }
+
+ unsigned Src1NumSignBits =
+ computeNumSignBits(Src1, DemandedElts, Depth + 1);
+ if (Src1NumSignBits == 1)
+ return 1; // Early out.
+
+ // Sub can have at most one carry bit. Thus we know that the output
+ // is, at worst, one more bit than the inputs.
+ FirstAnswer = std::min(Src1NumSignBits, Src2NumSignBits) - 1;
+ break;
+ }
case TargetOpcode::G_FCMP:
case TargetOpcode::G_ICMP: {
bool IsFP = Opcode == TargetOpcode::G_FCMP;
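
To see why the min(sign bits) - 1 bound used in the G_SUB case above is safe, here is a standalone sketch (plain C++, not the LLVM API) that exhaustively checks the bound for wrapping 8-bit subtraction:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Number of redundant sign bits of an 8-bit value, counting the sign bit
    // itself -- the same quantity computeNumSignBits tracks.
    static unsigned numSignBits(int8_t V) {
      unsigned N = 1;
      for (int Bit = 6; Bit >= 0; --Bit) {
        if (((V >> Bit) & 1) != ((V >> 7) & 1))
          break;
        ++N;
      }
      return N;
    }

    int main() {
      for (int A = -128; A <= 127; ++A)
        for (int B = -128; B <= 127; ++B) {
          int8_t Diff = static_cast<int8_t>(A - B); // wrapping, like G_SUB
          unsigned SA = numSignBits(static_cast<int8_t>(A));
          unsigned SB = numSignBits(static_cast<int8_t>(B));
          // Subtraction consumes at most one borrow, so at most one sign bit
          // is lost, and the result always keeps at least one sign bit.
          assert(numSignBits(Diff) >= std::max(1u, std::min(SA, SB) - 1));
        }
      std::puts("min(sign bits) - 1 bound holds for all 8-bit subtractions");
    }
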
diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
index 47640c4a..81ab317 100644
--- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
+++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
@@ -587,16 +587,12 @@ public:
} // namespace
char GlobalMergeFuncPassWrapper::ID = 0;
-INITIALIZE_PASS_BEGIN(GlobalMergeFuncPassWrapper, "global-merge-func",
- "Global merge function pass", false, false)
-INITIALIZE_PASS_END(GlobalMergeFuncPassWrapper, "global-merge-func",
- "Global merge function pass", false, false)
+INITIALIZE_PASS(GlobalMergeFuncPassWrapper, "global-merge-func",
+ "Global merge function pass", false, false)
-namespace llvm {
-ModulePass *createGlobalMergeFuncPass() {
+ModulePass *llvm::createGlobalMergeFuncPass() {
return new GlobalMergeFuncPassWrapper();
}
-} // namespace llvm
GlobalMergeFuncPassWrapper::GlobalMergeFuncPassWrapper() : ModulePass(ID) {
initializeGlobalMergeFuncPassWrapperPass(
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 3485a27..0e38017 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -101,15 +101,11 @@ static cl::opt<bool> EnablePrecomputePhysRegs(
static bool EnablePrecomputePhysRegs = false;
#endif // NDEBUG
-namespace llvm {
-
-cl::opt<bool> UseSegmentSetForPhysRegs(
+cl::opt<bool> llvm::UseSegmentSetForPhysRegs(
"use-segment-set-for-physregs", cl::Hidden, cl::init(true),
cl::desc(
"Use segment set for the computation of the live ranges of physregs."));
-} // end namespace llvm
-
void LiveIntervalsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addPreserved<LiveVariablesWrapperPass>();
diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp
index e859765..5c78d98 100644
--- a/llvm/lib/CodeGen/MIR2Vec.cpp
+++ b/llvm/lib/CodeGen/MIR2Vec.cpp
@@ -29,20 +29,17 @@ using namespace mir2vec;
STATISTIC(MIRVocabMissCounter,
"Number of lookups to MIR entities not present in the vocabulary");
-namespace llvm {
-namespace mir2vec {
-cl::OptionCategory MIR2VecCategory("MIR2Vec Options");
+cl::OptionCategory llvm::mir2vec::MIR2VecCategory("MIR2Vec Options");
// FIXME: Use a default vocab when not specified
static cl::opt<std::string>
VocabFile("mir2vec-vocab-path", cl::Optional,
cl::desc("Path to the vocabulary file for MIR2Vec"), cl::init(""),
cl::cat(MIR2VecCategory));
-cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
- cl::desc("Weight for machine opcode embeddings"),
- cl::cat(MIR2VecCategory));
-} // namespace mir2vec
-} // namespace llvm
+cl::opt<float>
+ llvm::mir2vec::OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
+ cl::desc("Weight for machine opcode embeddings"),
+ cl::cat(MIR2VecCategory));
//===----------------------------------------------------------------------===//
// Vocabulary Implementation
diff --git a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
index f5146f5..d988a2a 100644
--- a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
+++ b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
@@ -40,7 +40,7 @@ cl::opt<bool> ImprovedFSDiscriminator(
"improved-fs-discriminator", cl::Hidden, cl::init(false),
cl::desc("New FS discriminators encoding (incompatible with the original "
"encoding)"));
-}
+} // namespace llvm
char MIRAddFSDiscriminators::ID = 0;
diff --git a/llvm/lib/CodeGen/MIRNamerPass.cpp b/llvm/lib/CodeGen/MIRNamerPass.cpp
index bc65700..cbf8867 100644
--- a/llvm/lib/CodeGen/MIRNamerPass.cpp
+++ b/llvm/lib/CodeGen/MIRNamerPass.cpp
@@ -23,10 +23,6 @@
using namespace llvm;
-namespace llvm {
-extern char &MIRNamerID;
-} // namespace llvm
-
#define DEBUG_TYPE "mir-namer"
namespace {
@@ -53,10 +49,9 @@ public:
VRegRenamer Renamer(MF.getRegInfo());
- unsigned BBIndex = 0;
ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
- for (auto &MBB : RPOT)
- Changed |= Renamer.renameVRegs(MBB, BBIndex++);
+ for (const auto &[BBIndex, MBB] : enumerate(RPOT))
+ Changed |= Renamer.renameVRegs(MBB, BBIndex);
return Changed;
}
@@ -66,10 +61,4 @@ public:
char MIRNamer::ID;
-char &llvm::MIRNamerID = MIRNamer::ID;
-
-INITIALIZE_PASS_BEGIN(MIRNamer, "mir-namer", "Rename Register Operands", false,
- false)
-
-INITIALIZE_PASS_END(MIRNamer, "mir-namer", "Rename Register Operands", false,
- false)
+INITIALIZE_PASS(MIRNamer, "mir-namer", "Rename Register Operands", false, false)
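
The change above replaces a manually maintained BBIndex with llvm::enumerate, which yields (index, value) pairs. As a rough standalone illustration of the idiom (the callback-based helper below is a stand-in, not LLVM's API, which returns an iterable range usable with structured bindings):

    #include <cstdio>
    #include <string>
    #include <vector>

    // Minimal stand-in for llvm::enumerate: visit (index, element) pairs
    // without maintaining a counter by hand.
    template <typename Range, typename Fn>
    void enumerate(Range &&R, Fn F) {
      std::size_t Idx = 0;
      for (auto &&Elem : R)
        F(Idx++, Elem);
    }

    int main() {
      std::vector<std::string> Blocks = {"entry", "loop", "exit"};
      enumerate(Blocks, [](std::size_t BBIndex, const std::string &MBB) {
        std::printf("bb %zu: %s\n", BBIndex, MBB.c_str());
      });
    }
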
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index bf8a6cd..96428cd 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -107,10 +107,8 @@ struct MFPrintState {
} // end anonymous namespace
-namespace llvm::yaml {
-
/// This struct serializes the LLVM IR module.
-template <> struct BlockScalarTraits<Module> {
+template <> struct yaml::BlockScalarTraits<Module> {
static void output(const Module &Mod, void *Ctxt, raw_ostream &OS) {
Mod.print(OS, nullptr);
}
@@ -121,8 +119,6 @@ template <> struct BlockScalarTraits<Module> {
}
};
-} // end namespace llvm::yaml
-
static void printRegMIR(Register Reg, yaml::StringValue &Dest,
const TargetRegisterInfo *TRI) {
raw_string_ostream OS(Dest.Value);
diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
index b2731b69..a72c2c4 100644
--- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
+++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
@@ -97,7 +97,9 @@ static const bool EnableDevelopmentFeatures = false;
/// this happens only in development mode. It's a no-op otherwise.
namespace llvm {
extern cl::opt<unsigned> EvictInterferenceCutoff;
+} // namespace llvm
+namespace {
class RegAllocScoring : public MachineFunctionPass {
public:
static char ID;
@@ -124,11 +126,12 @@ public:
/// Performs this pass
bool runOnMachineFunction(MachineFunction &) override;
};
+} // namespace
char RegAllocScoring::ID = 0;
-FunctionPass *createRegAllocScoringPass() { return new RegAllocScoring(); }
-
-} // namespace llvm
+FunctionPass *llvm::createRegAllocScoringPass() {
+ return new RegAllocScoring();
+}
INITIALIZE_PASS(RegAllocScoring, "regallocscoringpass",
"Register Allocation Scoring Pass", false, false)
diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index e7fa082..26eb10f 100644
--- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -29,7 +29,6 @@ using namespace llvm;
#define DEBUG_TYPE "machine-block-freq"
-namespace llvm {
static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
"view-machine-block-freq-propagation-dags", cl::Hidden,
cl::desc("Pop up a window to show a dag displaying how machine block "
@@ -44,6 +43,7 @@ static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
clEnumValN(GVDT_Count, "count", "display a graph using the real "
"profile count if available.")));
+namespace llvm {
// Similar to the option above, but used to control BFI display only after the MBP pass
cl::opt<GVDAGType> ViewBlockLayoutWithBFI(
"view-block-layout-with-bfi", cl::Hidden,
@@ -69,15 +69,15 @@ extern cl::opt<std::string> ViewBlockFreqFuncName;
// Defined in Analysis/BlockFrequencyInfo.cpp: -view-hot-freq-perc=
extern cl::opt<unsigned> ViewHotFreqPercent;
-static cl::opt<bool> PrintMachineBlockFreq(
- "print-machine-bfi", cl::init(false), cl::Hidden,
- cl::desc("Print the machine block frequency info."));
-
// Command line option to specify the name of the function for block frequency
// dump. Defined in Analysis/BlockFrequencyInfo.cpp.
extern cl::opt<std::string> PrintBFIFuncName;
} // namespace llvm
+static cl::opt<bool>
+ PrintMachineBlockFreq("print-machine-bfi", cl::init(false), cl::Hidden,
+ cl::desc("Print the machine block frequency info."));
+
static GVDAGType getGVDT() {
if (ViewBlockLayoutWithBFI != GVDT_None)
return ViewBlockLayoutWithBFI;
@@ -85,9 +85,7 @@ static GVDAGType getGVDT() {
return ViewMachineBlockFreqPropagationDAG;
}
-namespace llvm {
-
-template <> struct GraphTraits<MachineBlockFrequencyInfo *> {
+template <> struct llvm::GraphTraits<MachineBlockFrequencyInfo *> {
using NodeRef = const MachineBasicBlock *;
using ChildIteratorType = MachineBasicBlock::const_succ_iterator;
using nodes_iterator = pointer_iterator<MachineFunction::const_iterator>;
@@ -116,7 +114,7 @@ using MBFIDOTGraphTraitsBase =
MachineBranchProbabilityInfo>;
template <>
-struct DOTGraphTraits<MachineBlockFrequencyInfo *>
+struct llvm::DOTGraphTraits<MachineBlockFrequencyInfo *>
: public MBFIDOTGraphTraitsBase {
const MachineFunction *CurFunc = nullptr;
DenseMap<const MachineBasicBlock *, int> LayoutOrderMap;
@@ -159,8 +157,6 @@ struct DOTGraphTraits<MachineBlockFrequencyInfo *>
}
};
-} // end namespace llvm
-
AnalysisKey MachineBlockFrequencyAnalysis::Key;
MachineBlockFrequencyAnalysis::Result
diff --git a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
index 2e92dd8..7ca4582 100644
--- a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -18,13 +18,8 @@
using namespace llvm;
-INITIALIZE_PASS_BEGIN(MachineBranchProbabilityInfoWrapperPass,
- "machine-branch-prob",
- "Machine Branch Probability Analysis", false, true)
-INITIALIZE_PASS_END(MachineBranchProbabilityInfoWrapperPass,
- "machine-branch-prob",
- "Machine Branch Probability Analysis", false, true)
-
+INITIALIZE_PASS(MachineBranchProbabilityInfoWrapperPass, "machine-branch-prob",
+ "Machine Branch Probability Analysis", false, true)
namespace llvm {
cl::opt<unsigned>
StaticLikelyProb("static-likely-prob",
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 224231c..bfa5ab2 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -719,43 +719,41 @@ MachineFunction::CallSiteInfo::CallSiteInfo(const CallBase &CB) {
}
}
-namespace llvm {
+template <>
+struct llvm::DOTGraphTraits<const MachineFunction *>
+ : public DefaultDOTGraphTraits {
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
- template<>
- struct DOTGraphTraits<const MachineFunction*> : public DefaultDOTGraphTraits {
- DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+ static std::string getGraphName(const MachineFunction *F) {
+ return ("CFG for '" + F->getName() + "' function").str();
+ }
- static std::string getGraphName(const MachineFunction *F) {
- return ("CFG for '" + F->getName() + "' function").str();
+ std::string getNodeLabel(const MachineBasicBlock *Node,
+ const MachineFunction *Graph) {
+ std::string OutStr;
+ {
+ raw_string_ostream OSS(OutStr);
+
+ if (isSimple()) {
+ OSS << printMBBReference(*Node);
+ if (const BasicBlock *BB = Node->getBasicBlock())
+ OSS << ": " << BB->getName();
+ } else
+ Node->print(OSS);
}
- std::string getNodeLabel(const MachineBasicBlock *Node,
- const MachineFunction *Graph) {
- std::string OutStr;
- {
- raw_string_ostream OSS(OutStr);
-
- if (isSimple()) {
- OSS << printMBBReference(*Node);
- if (const BasicBlock *BB = Node->getBasicBlock())
- OSS << ": " << BB->getName();
- } else
- Node->print(OSS);
- }
-
- if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
-
- // Process string output to make it nicer...
- for (unsigned i = 0; i != OutStr.length(); ++i)
- if (OutStr[i] == '\n') { // Left justify
- OutStr[i] = '\\';
- OutStr.insert(OutStr.begin()+i+1, 'l');
- }
- return OutStr;
- }
- };
+ if (OutStr[0] == '\n')
+ OutStr.erase(OutStr.begin());
-} // end namespace llvm
+ // Process string output to make it nicer...
+ for (unsigned i = 0; i != OutStr.length(); ++i)
+ if (OutStr[i] == '\n') { // Left justify
+ OutStr[i] = '\\';
+ OutStr.insert(OutStr.begin() + i + 1, 'l');
+ }
+ return OutStr;
+ }
+};
void MachineFunction::viewCFG() const
{
diff --git a/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp b/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
index 0f88a7b..5111322 100644
--- a/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -60,13 +60,11 @@ char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
INITIALIZE_PASS(MachineFunctionPrinterPass, "machineinstr-printer",
"Machine Function Printer", false, false)
-namespace llvm {
/// Returns a newly-created MachineFunction Printer pass. The
/// default banner is empty.
///
-MachineFunctionPass *createMachineFunctionPrinterPass(raw_ostream &OS,
- const std::string &Banner){
+MachineFunctionPass *
+llvm::createMachineFunctionPrinterPass(raw_ostream &OS,
+ const std::string &Banner) {
return new MachineFunctionPrinterPass(OS, Banner);
}
-
-}
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index fdae3b4..9feb974 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -593,15 +593,12 @@ struct MachineOutliner : public ModulePass {
char MachineOutliner::ID = 0;
-namespace llvm {
-ModulePass *createMachineOutlinerPass(RunOutliner RunOutlinerMode) {
+ModulePass *llvm::createMachineOutlinerPass(RunOutliner RunOutlinerMode) {
MachineOutliner *OL = new MachineOutliner();
OL->RunOutlinerMode = RunOutlinerMode;
return OL;
}
-} // namespace llvm
-
INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE, "Machine Function Outliner", false,
false)
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 89ed4da..a717d9e 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -201,16 +201,15 @@ static cl::opt<unsigned> SwpMaxNumStores(
cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden,
cl::init(200));
-namespace llvm {
-
// A command line option to enable the CopyToPhi DAG mutation.
-cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
- cl::init(true),
- cl::desc("Enable CopyToPhi DAG Mutation"));
+cl::opt<bool>
+ llvm::SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
+ cl::init(true),
+ cl::desc("Enable CopyToPhi DAG Mutation"));
/// A command line argument to force pipeliner to use specified issue
/// width.
-cl::opt<int> SwpForceIssueWidth(
+cl::opt<int> llvm::SwpForceIssueWidth(
"pipeliner-force-issue-width",
cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
cl::init(-1));
@@ -226,8 +225,6 @@ static cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
clEnumValN(WindowSchedulingFlag::WS_Force, "force",
"Use window algorithm instead of SMS algorithm.")));
-} // end namespace llvm
-
unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
char MachinePipeliner::ID = 0;
#ifndef NDEBUG
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 299bcc4..3ed1045 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -176,9 +176,7 @@ STATISTIC(NumNodeOrderPostRA,
STATISTIC(NumFirstValidPostRA,
"Number of scheduling units chosen for FirstValid heuristic post-RA");
-namespace llvm {
-
-cl::opt<MISched::Direction> PreRADirection(
+cl::opt<MISched::Direction> llvm::PreRADirection(
"misched-prera-direction", cl::Hidden,
cl::desc("Pre reg-alloc list scheduling direction"),
cl::init(MISched::Unspecified),
@@ -206,33 +204,31 @@ static cl::opt<bool>
DumpCriticalPathLength("misched-dcpl", cl::Hidden,
cl::desc("Print critical path length to stdout"));
-cl::opt<bool> VerifyScheduling(
+cl::opt<bool> llvm::VerifyScheduling(
"verify-misched", cl::Hidden,
cl::desc("Verify machine instrs before and after machine scheduling"));
#ifndef NDEBUG
-cl::opt<bool> ViewMISchedDAGs(
+cl::opt<bool> llvm::ViewMISchedDAGs(
"view-misched-dags", cl::Hidden,
cl::desc("Pop up a window to show MISched dags after they are processed"));
-cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden,
- cl::desc("Print schedule DAGs"));
-cl::opt<bool> MISchedDumpReservedCycles(
+cl::opt<bool> llvm::PrintDAGs("misched-print-dags", cl::Hidden,
+ cl::desc("Print schedule DAGs"));
+static cl::opt<bool> MISchedDumpReservedCycles(
"misched-dump-reserved-cycles", cl::Hidden, cl::init(false),
cl::desc("Dump resource usage at schedule boundary."));
-cl::opt<bool> MischedDetailResourceBooking(
+static cl::opt<bool> MischedDetailResourceBooking(
"misched-detail-resource-booking", cl::Hidden, cl::init(false),
cl::desc("Show details of invoking getNextResoufceCycle."));
#else
-const bool ViewMISchedDAGs = false;
-const bool PrintDAGs = false;
-const bool MischedDetailResourceBooking = false;
+const bool llvm::ViewMISchedDAGs = false;
+const bool llvm::PrintDAGs = false;
+static const bool MischedDetailResourceBooking = false;
#ifdef LLVM_ENABLE_DUMP
-const bool MISchedDumpReservedCycles = false;
+static const bool MISchedDumpReservedCycles = false;
#endif // LLVM_ENABLE_DUMP
#endif // NDEBUG
-} // end namespace llvm
-
#ifndef NDEBUG
/// In some situations a few uninteresting nodes depend on nearly all other
/// nodes in the graph, provide a cutoff to hide them.
@@ -2053,28 +2049,24 @@ public:
} // end anonymous namespace
-namespace llvm {
-
std::unique_ptr<ScheduleDAGMutation>
-createLoadClusterDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI,
- bool ReorderWhileClustering) {
+llvm::createLoadClusterDAGMutation(const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ bool ReorderWhileClustering) {
return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(
TII, TRI, ReorderWhileClustering)
: nullptr;
}
std::unique_ptr<ScheduleDAGMutation>
-createStoreClusterDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI,
- bool ReorderWhileClustering) {
+llvm::createStoreClusterDAGMutation(const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ bool ReorderWhileClustering) {
return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(
TII, TRI, ReorderWhileClustering)
: nullptr;
}
-} // end namespace llvm
-
// Sort all the loads/stores first, then for each load/store, check the
// following loads/stores one by one until reaching the first non-dependent
// one, and call the target hook to see if they can cluster.
@@ -2304,16 +2296,12 @@ protected:
} // end anonymous namespace
-namespace llvm {
-
std::unique_ptr<ScheduleDAGMutation>
-createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI) {
+llvm::createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
return std::make_unique<CopyConstrain>(TII, TRI);
}
-} // end namespace llvm
-
/// constrainLocalCopy handles two possibilities:
/// 1) Local src:
/// I0: = dst
@@ -3445,14 +3433,13 @@ void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
}
#endif
-namespace llvm {
/// Return true if this heuristic determines order.
/// TODO: Consider refactor return type of these functions as integer or enum,
/// as we may need to differentiate whether TryCand is better than Cand.
-bool tryLess(int TryVal, int CandVal,
- GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::CandReason Reason) {
+bool llvm::tryLess(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
if (TryVal < CandVal) {
TryCand.Reason = Reason;
return true;
@@ -3465,10 +3452,10 @@ bool tryLess(int TryVal, int CandVal,
return false;
}
-bool tryGreater(int TryVal, int CandVal,
- GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::CandReason Reason) {
+bool llvm::tryGreater(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
if (TryVal > CandVal) {
TryCand.Reason = Reason;
return true;
@@ -3481,9 +3468,9 @@ bool tryGreater(int TryVal, int CandVal,
return false;
}
-bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- SchedBoundary &Zone) {
+bool llvm::tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ SchedBoundary &Zone) {
if (Zone.isTop()) {
// Prefer the candidate with the lesser depth, but only if one of them has
// depth greater than the total latency scheduled so far, otherwise either
@@ -3513,7 +3500,6 @@ bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
}
return false;
}
-} // end namespace llvm
static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop,
bool IsPostRA = false) {
@@ -3798,14 +3784,12 @@ void GenericScheduler::registerRoots() {
}
}
-namespace llvm {
-bool tryPressure(const PressureChange &TryP,
- const PressureChange &CandP,
- GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::CandReason Reason,
- const TargetRegisterInfo *TRI,
- const MachineFunction &MF) {
+bool llvm::tryPressure(const PressureChange &TryP, const PressureChange &CandP,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF) {
// If one candidate decreases and the other increases, go with it.
// Invalid candidates have UnitInc==0.
if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
@@ -3838,7 +3822,7 @@ bool tryPressure(const PressureChange &TryP,
return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
}
-unsigned getWeakLeft(const SUnit *SU, bool isTop) {
+unsigned llvm::getWeakLeft(const SUnit *SU, bool isTop) {
return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
}
@@ -3849,7 +3833,7 @@ unsigned getWeakLeft(const SUnit *SU, bool isTop) {
/// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
/// with the operation that produces or consumes the physreg. We'll do this when
/// regalloc has support for parallel copies.
-int biasPhysReg(const SUnit *SU, bool isTop) {
+int llvm::biasPhysReg(const SUnit *SU, bool isTop) {
const MachineInstr *MI = SU->getInstr();
if (MI->isCopy()) {
@@ -3884,7 +3868,6 @@ int biasPhysReg(const SUnit *SU, bool isTop) {
return 0;
}
-} // end namespace llvm
void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
@@ -4812,13 +4795,13 @@ static MachineSchedRegistry ShufflerRegistry(
//===----------------------------------------------------------------------===//
#ifndef NDEBUG
-namespace llvm {
-template<> struct GraphTraits<
- ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
+template <>
+struct llvm::GraphTraits<ScheduleDAGMI *> : public GraphTraits<ScheduleDAG *> {
+};
-template<>
-struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
+template <>
+struct llvm::DOTGraphTraits<ScheduleDAGMI *> : public DefaultDOTGraphTraits {
DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
static std::string getGraphName(const ScheduleDAG *G) {
@@ -4878,7 +4861,6 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
}
};
-} // end namespace llvm
#endif // NDEBUG
/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index c2d4aa0..9ac3f741 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -485,10 +485,7 @@ struct LoopBounds {
// Specialize po_iterator_storage in order to prune the post-order traversal so
// it is limited to the current loop and doesn't traverse the loop back edges.
-namespace llvm {
-
-template<>
-class po_iterator_storage<LoopBounds, true> {
+template <> class llvm::po_iterator_storage<LoopBounds, true> {
LoopBounds &LB;
public:
@@ -519,8 +516,6 @@ public:
}
};
-} // end namespace llvm
-
/// Compute the trace through MBB.
void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
LLVM_DEBUG(dbgs() << "Computing " << getName() << " trace through "
diff --git a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
index 087ac62..59c587c 100644
--- a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
+++ b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
@@ -9,7 +9,7 @@
#include "llvm/CodeGen/NonRelocatableStringpool.h"
#include "llvm/ADT/STLExtras.h"
-namespace llvm {
+using namespace llvm;
DwarfStringPoolEntryRef NonRelocatableStringpool::getEntry(StringRef S) {
auto I = Strings.try_emplace(S);
@@ -43,5 +43,3 @@ NonRelocatableStringpool::getEntriesForEmission() const {
});
return Result;
}
-
-} // namespace llvm
diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index 6f373a5..e9ffa85 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -76,8 +76,6 @@ using namespace llvm::safestack;
#define DEBUG_TYPE "safe-stack"
-namespace llvm {
-
STATISTIC(NumFunctions, "Total number of functions");
STATISTIC(NumUnsafeStackFunctions, "Number of functions with unsafe stack");
STATISTIC(NumUnsafeStackRestorePointsFunctions,
@@ -89,8 +87,6 @@ STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas");
STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments");
STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads");
-} // namespace llvm
-
/// Use __safestack_pointer_address even if the platform has a faster way of
/// accessing the safe stack pointer.
static cl::opt<bool>
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index eae2e8c..3268c26 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -1551,14 +1551,10 @@ LLVM_DUMP_METHOD void ILPValue::dump() const {
dbgs() << *this << '\n';
}
-namespace llvm {
-
LLVM_ATTRIBUTE_UNUSED
-raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
+raw_ostream &llvm::operator<<(raw_ostream &OS, const ILPValue &Val) {
Val.print(OS);
return OS;
}
-} // end namespace llvm
-
#endif
diff --git a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
index e7b1494..c80eade 100644
--- a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -16,57 +16,51 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-namespace llvm {
- template<>
- struct DOTGraphTraits<ScheduleDAG*> : public DefaultDOTGraphTraits {
+template <>
+struct llvm::DOTGraphTraits<ScheduleDAG *> : public DefaultDOTGraphTraits {
- DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
- static std::string getGraphName(const ScheduleDAG *G) {
- return std::string(G->MF.getName());
- }
+ static std::string getGraphName(const ScheduleDAG *G) {
+ return std::string(G->MF.getName());
+ }
- static bool renderGraphFromBottomUp() {
- return true;
- }
+ static bool renderGraphFromBottomUp() { return true; }
- static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
- return (Node->NumPreds > 10 || Node->NumSuccs > 10);
- }
+ static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
+ return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+ }
- static std::string getNodeIdentifierLabel(const SUnit *Node,
- const ScheduleDAG *Graph) {
- std::string R;
- raw_string_ostream OS(R);
- OS << static_cast<const void *>(Node);
- return R;
- }
+ static std::string getNodeIdentifierLabel(const SUnit *Node,
+ const ScheduleDAG *Graph) {
+ std::string R;
+ raw_string_ostream OS(R);
+ OS << static_cast<const void *>(Node);
+ return R;
+ }
- /// If you want to override the dot attributes printed for a particular
- /// edge, override this method.
- static std::string getEdgeAttributes(const SUnit *Node,
- SUnitIterator EI,
- const ScheduleDAG *Graph) {
- if (EI.isArtificialDep())
- return "color=cyan,style=dashed";
- if (EI.isCtrlDep())
- return "color=blue,style=dashed";
- return "";
- }
+ /// If you want to override the dot attributes printed for a particular
+ /// edge, override this method.
+ static std::string getEdgeAttributes(const SUnit *Node, SUnitIterator EI,
+ const ScheduleDAG *Graph) {
+ if (EI.isArtificialDep())
+ return "color=cyan,style=dashed";
+ if (EI.isCtrlDep())
+ return "color=blue,style=dashed";
+ return "";
+ }
+ std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *Graph);
+ static std::string getNodeAttributes(const SUnit *N,
+ const ScheduleDAG *Graph) {
+ return "shape=Mrecord";
+ }
- std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *Graph);
- static std::string getNodeAttributes(const SUnit *N,
- const ScheduleDAG *Graph) {
- return "shape=Mrecord";
- }
-
- static void addCustomGraphFeatures(ScheduleDAG *G,
- GraphWriter<ScheduleDAG*> &GW) {
- return G->addCustomGraphFeatures(GW);
- }
- };
-}
+ static void addCustomGraphFeatures(ScheduleDAG *G,
+ GraphWriter<ScheduleDAG *> &GW) {
+ return G->addCustomGraphFeatures(GW);
+ }
+};
std::string DOTGraphTraits<ScheduleDAG*>::getNodeLabel(const SUnit *SU,
const ScheduleDAG *G) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b1accdd..e153842 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -509,6 +509,7 @@ namespace {
SDValue visitFMUL(SDNode *N);
template <class MatchContextClass> SDValue visitFMA(SDNode *N);
SDValue visitFMAD(SDNode *N);
+ SDValue visitFMULADD(SDNode *N);
SDValue visitFDIV(SDNode *N);
SDValue visitFREM(SDNode *N);
SDValue visitFSQRT(SDNode *N);
@@ -1991,6 +1992,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::FMUL: return visitFMUL(N);
case ISD::FMA: return visitFMA<EmptyMatchContext>(N);
case ISD::FMAD: return visitFMAD(N);
+ case ISD::FMULADD: return visitFMULADD(N);
case ISD::FDIV: return visitFDIV(N);
case ISD::FREM: return visitFREM(N);
case ISD::FSQRT: return visitFSQRT(N);
@@ -18444,6 +18446,21 @@ SDValue DAGCombiner::visitFMAD(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitFMULADD(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Constant fold FMULADD.
+ if (SDValue C =
+ DAG.FoldConstantArithmetic(ISD::FMULADD, DL, VT, {N0, N1, N2}))
+ return C;
+
+ return SDValue();
+}
+
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal.
// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
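
The FMAD/FMULADD folding path above rounds twice (multiply, then add), while the FMA path uses fusedMultiplyAdd's single rounding. A small sketch of the observable difference on plain doubles:

    #include <cmath>
    #include <cstdio>

    int main() {
      double A = 1.0 + 0x1p-27; // (1 + 2^-27)^2 = 1 + 2^-26 + 2^-54
      double Prod = A * A;      // two-step folding rounds here; 2^-54 is lost
      // Single-rounding fold: the exact product participates in the add, so
      // the low bits the separate multiply discarded are recovered.
      double Fused = std::fma(A, A, -Prod);
      std::printf("bits lost by the separate multiply: %a\n", Fused); // 0x1p-54
    }
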
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 08af74c..4512c5c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5786,6 +5786,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::FCOPYSIGN:
case ISD::FMA:
case ISD::FMAD:
+ case ISD::FMULADD:
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT:
@@ -5904,6 +5905,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
case ISD::FCOSH:
case ISD::FTANH:
case ISD::FMA:
+ case ISD::FMULADD:
case ISD::FMAD: {
if (SNaN)
return true;
@@ -7231,7 +7233,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
}
// Handle fma/fmad special cases.
- if (Opcode == ISD::FMA || Opcode == ISD::FMAD) {
+ if (Opcode == ISD::FMA || Opcode == ISD::FMAD || Opcode == ISD::FMULADD) {
assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
assert(Ops[0].getValueType() == VT && Ops[1].getValueType() == VT &&
Ops[2].getValueType() == VT && "FMA types must match!");
@@ -7242,7 +7244,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
APFloat V1 = C1->getValueAPF();
const APFloat &V2 = C2->getValueAPF();
const APFloat &V3 = C3->getValueAPF();
- if (Opcode == ISD::FMAD) {
+ if (Opcode == ISD::FMAD || Opcode == ISD::FMULADD) {
V1.multiply(V2, APFloat::rmNearestTiesToEven);
V1.add(V3, APFloat::rmNearestTiesToEven);
} else
@@ -11844,25 +11846,38 @@ SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
/// getNodeIfExists - Get the specified node if it's already available, or
/// else return NULL.
SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
- ArrayRef<SDValue> Ops) {
+ ArrayRef<SDValue> Ops,
+ bool AllowCommute) {
SDNodeFlags Flags;
if (Inserter)
Flags = Inserter->getFlags();
- return getNodeIfExists(Opcode, VTList, Ops, Flags);
+ return getNodeIfExists(Opcode, VTList, Ops, Flags, AllowCommute);
}
SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
ArrayRef<SDValue> Ops,
- const SDNodeFlags Flags) {
- if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {
+ const SDNodeFlags Flags,
+ bool AllowCommute) {
+ if (VTList.VTs[VTList.NumVTs - 1] == MVT::Glue)
+ return nullptr;
+
+ auto Lookup = [&](ArrayRef<SDValue> LookupOps) -> SDNode * {
FoldingSetNodeID ID;
- AddNodeIDNode(ID, Opcode, VTList, Ops);
+ AddNodeIDNode(ID, Opcode, VTList, LookupOps);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, SDLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP)) {
E->intersectFlagsWith(Flags);
return E;
}
- }
+ return nullptr;
+ };
+
+ if (SDNode *Existing = Lookup(Ops))
+ return Existing;
+
+ if (AllowCommute && TLI->isCommutativeBinOp(Opcode))
+ return Lookup({Ops[1], Ops[0]});
+
return nullptr;
}
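
The AllowCommute path above retries the CSE lookup with the operands swapped when the opcode is commutative. A toy stand-in for that lookup structure (the real code keys a FoldingSet on opcode, value types, and operands; the map below is only illustrative):

    #include <cstdio>
    #include <map>
    #include <utility>

    using Key = std::pair<int, int>; // (LHS node id, RHS node id)

    // Try the key as written; if the opcode is commutative, retry swapped.
    const char *lookup(const std::map<Key, const char *> &Nodes, Key K,
                       bool Commutative) {
      if (auto It = Nodes.find(K); It != Nodes.end())
        return It->second;
      if (Commutative)
        if (auto It = Nodes.find({K.second, K.first}); It != Nodes.end())
          return It->second;
      return nullptr;
    }

    int main() {
      std::map<Key, const char *> Nodes = {{{1, 2}, "add t1, t2"}};
      // A generic ADD built as (t1, t2) is still found by a query built as
      // (t2, t1) once commuted lookup is allowed.
      std::printf("%s\n", lookup(Nodes, {2, 1}, /*Commutative=*/true));
    }
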
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c21890a..0f2b518 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6996,6 +6996,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)),
getValue(I.getArgOperand(2)), Flags));
+ } else if (TLI.isOperationLegalOrCustom(ISD::FMULADD, VT)) {
+ // TODO: Support splitting the vector.
+ setValue(&I, DAG.getNode(ISD::FMULADD, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)),
+ getValue(I.getArgOperand(2)), Flags));
} else {
// TODO: Intrinsic calls should have fast-math-flags.
SDValue Mul = DAG.getNode(
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index fcfbfe6..39cbfad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -310,6 +310,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::FMA: return "fma";
case ISD::STRICT_FMA: return "strict_fma";
case ISD::FMAD: return "fmad";
+ case ISD::FMULADD: return "fmuladd";
case ISD::FREM: return "frem";
case ISD::STRICT_FREM: return "strict_frem";
case ISD::FCOPYSIGN: return "fcopysign";
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index cc503d3..920dff9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7676,6 +7676,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
break;
}
case ISD::FMA:
+ case ISD::FMULADD:
case ISD::FMAD: {
if (!Flags.hasNoSignedZeros())
break;
diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
index 64e5cd5..95a9c3f 100644
--- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
+++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
@@ -306,10 +306,7 @@ char &llvm::StackFrameLayoutAnalysisPassID = StackFrameLayoutAnalysisLegacy::ID;
INITIALIZE_PASS(StackFrameLayoutAnalysisLegacy, "stack-frame-layout",
"Stack Frame Layout", false, false)
-namespace llvm {
/// Returns a newly-created StackFrameLayout pass.
-MachineFunctionPass *createStackFrameLayoutAnalysisPass() {
+MachineFunctionPass *llvm::createStackFrameLayoutAnalysisPass() {
return new StackFrameLayoutAnalysisLegacy();
}
-
-} // namespace llvm
diff --git a/llvm/lib/CodeGen/StaticDataAnnotator.cpp b/llvm/lib/CodeGen/StaticDataAnnotator.cpp
index 53a9ab4..eac20120 100644
--- a/llvm/lib/CodeGen/StaticDataAnnotator.cpp
+++ b/llvm/lib/CodeGen/StaticDataAnnotator.cpp
@@ -75,22 +75,11 @@ bool StaticDataAnnotator::runOnModule(Module &M) {
bool Changed = false;
for (auto &GV : M.globals()) {
- if (GV.isDeclarationForLinker())
+ if (!llvm::memprof::IsAnnotationOK(GV))
continue;
- // The implementation below assumes prior passes don't set section prefixes,
- // and specifically do 'assign' rather than 'update'. So report error if a
- // section prefix is already set.
- if (auto maybeSectionPrefix = GV.getSectionPrefix();
- maybeSectionPrefix && !maybeSectionPrefix->empty())
- llvm::report_fatal_error("Global variable " + GV.getName() +
- " already has a section prefix " +
- *maybeSectionPrefix);
-
StringRef SectionPrefix = SDPI->getConstantSectionPrefix(&GV, PSI);
- if (SectionPrefix.empty())
- continue;
-
+ // setSectionPrefix returns true if the section prefix is updated.
Changed |= GV.setSectionPrefix(SectionPrefix);
}
diff --git a/llvm/lib/CodeGen/StaticDataSplitter.cpp b/llvm/lib/CodeGen/StaticDataSplitter.cpp
index e22dc25..1593a40 100644
--- a/llvm/lib/CodeGen/StaticDataSplitter.cpp
+++ b/llvm/lib/CodeGen/StaticDataSplitter.cpp
@@ -130,10 +130,8 @@ StaticDataSplitter::getConstant(const MachineOperand &Op,
if (Op.isGlobal()) {
// Find global variables with local linkage.
const GlobalVariable *GV = getLocalLinkageGlobalVariable(Op.getGlobal());
- // Skip 'llvm.'-prefixed global variables conservatively because they are
- // often handled specially, and skip those not in static data
- // sections.
- if (!GV || GV->getName().starts_with("llvm.") ||
+ // Skip those not eligible for annotation or not in static data sections.
+ if (!GV || !llvm::memprof::IsAnnotationOK(*GV) ||
!inStaticDataSection(*GV, TM))
return nullptr;
return GV;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c23281a..060b1dd 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -815,7 +815,8 @@ void TargetLoweringBase::initActions() {
ISD::FTAN, ISD::FACOS,
ISD::FASIN, ISD::FATAN,
ISD::FCOSH, ISD::FSINH,
- ISD::FTANH, ISD::FATAN2},
+ ISD::FTANH, ISD::FATAN2,
+ ISD::FMULADD},
VT, Expand);
// Overflow operations default to expand
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index c9e4618..971f822 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -102,10 +102,8 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet,
return true;
}
-namespace llvm {
-
-Printable printReg(Register Reg, const TargetRegisterInfo *TRI,
- unsigned SubIdx, const MachineRegisterInfo *MRI) {
+Printable llvm::printReg(Register Reg, const TargetRegisterInfo *TRI,
+ unsigned SubIdx, const MachineRegisterInfo *MRI) {
return Printable([Reg, TRI, SubIdx, MRI](raw_ostream &OS) {
if (!Reg)
OS << "$noreg";
@@ -135,7 +133,7 @@ Printable printReg(Register Reg, const TargetRegisterInfo *TRI,
});
}
-Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+Printable llvm::printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
return Printable([Unit, TRI](raw_ostream &OS) {
// Generic printout when TRI is missing.
if (!TRI) {
@@ -158,7 +156,7 @@ Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
});
}
-Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+Printable llvm::printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
return Printable([Unit, TRI](raw_ostream &OS) {
if (Register::isVirtualRegister(Unit)) {
OS << '%' << Register(Unit).virtRegIndex();
@@ -168,8 +166,9 @@ Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
});
}
-Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo,
- const TargetRegisterInfo *TRI) {
+Printable llvm::printRegClassOrBank(Register Reg,
+ const MachineRegisterInfo &RegInfo,
+ const TargetRegisterInfo *TRI) {
return Printable([Reg, &RegInfo, TRI](raw_ostream &OS) {
if (RegInfo.getRegClassOrNull(Reg))
OS << StringRef(TRI->getRegClassName(RegInfo.getRegClass(Reg))).lower();
@@ -183,8 +182,6 @@ Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo,
});
}
-} // end namespace llvm
-
/// getAllocatableClass - Return the maximal subclass of the given register
class that is allocatable, or NULL.
const TargetRegisterClass *
diff --git a/llvm/lib/IR/ConstantFPRange.cpp b/llvm/lib/IR/ConstantFPRange.cpp
index 51d2e21..5b87686 100644
--- a/llvm/lib/IR/ConstantFPRange.cpp
+++ b/llvm/lib/IR/ConstantFPRange.cpp
@@ -8,6 +8,7 @@
#include "llvm/IR/ConstantFPRange.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
@@ -506,3 +507,168 @@ ConstantFPRange ConstantFPRange::sub(const ConstantFPRange &Other) const {
// fsub X, Y = fadd X, (fneg Y)
return add(Other.negate());
}
+
+void ConstantFPRange::flushDenormals(DenormalMode::DenormalModeKind Mode) {
+ if (Mode == DenormalMode::IEEE)
+ return;
+ FPClassTest Class = classify();
+ if (!(Class & fcSubnormal))
+ return;
+
+ auto &Sem = getSemantics();
+ // PreserveSign: PosSubnormal -> PosZero, NegSubnormal -> NegZero
+ // PositiveZero: PosSubnormal -> PosZero, NegSubnormal -> PosZero
+ // Dynamic: PosSubnormal -> PosZero, NegSubnormal -> NegZero/PosZero
+ bool ZeroLowerNegative =
+ Mode != DenormalMode::PositiveZero && (Class & fcNegSubnormal);
+ bool ZeroUpperNegative =
+ Mode == DenormalMode::PreserveSign && !(Class & fcPosSubnormal);
+ assert((ZeroLowerNegative || !ZeroUpperNegative) &&
+ "ZeroLower is greater than ZeroUpper.");
+ Lower = minnum(Lower, APFloat::getZero(Sem, ZeroLowerNegative));
+ Upper = maxnum(Upper, APFloat::getZero(Sem, ZeroUpperNegative));
+}
+
+/// Represent a contiguous range of values sharing the same sign.
+struct SameSignRange {
+ bool HasZero;
+ bool HasNonZero;
+ bool HasInf;
+ // The lower and upper bounds of the range (inclusive).
+ // The sign is dropped and infinities are excluded.
+ std::optional<std::pair<APFloat, APFloat>> FinitePart;
+
+ explicit SameSignRange(const APFloat &Lower, const APFloat &Upper)
+ : HasZero(Lower.isZero()), HasNonZero(!Upper.isZero()),
+ HasInf(Upper.isInfinity()) {
+ assert(!Lower.isNegative() && !Upper.isNegative() &&
+ "The sign should be dropped.");
+ assert(strictCompare(Lower, Upper) != APFloat::cmpGreaterThan &&
+ "Empty set.");
+ if (!Lower.isInfinity())
+ FinitePart = {Lower,
+ HasInf ? APFloat::getLargest(Lower.getSemantics()) : Upper};
+ }
+};
+
+/// Split the range into positive and negative components.
+static void splitPosNeg(const APFloat &Lower, const APFloat &Upper,
+ std::optional<SameSignRange> &NegPart,
+ std::optional<SameSignRange> &PosPart) {
+ assert(strictCompare(Lower, Upper) != APFloat::cmpGreaterThan &&
+ "Non-NaN part is empty.");
+ if (Lower.isNegative() == Upper.isNegative()) {
+ if (Lower.isNegative())
+ NegPart = SameSignRange{abs(Upper), abs(Lower)};
+ else
+ PosPart = SameSignRange{Lower, Upper};
+ return;
+ }
+ auto &Sem = Lower.getSemantics();
+ NegPart = SameSignRange{APFloat::getZero(Sem), abs(Lower)};
+ PosPart = SameSignRange{APFloat::getZero(Sem), Upper};
+}
+
+ConstantFPRange ConstantFPRange::mul(const ConstantFPRange &Other) const {
+ auto &Sem = getSemantics();
+ bool ResMayBeQNaN = ((MayBeQNaN || MayBeSNaN) && !Other.isEmptySet()) ||
+ ((Other.MayBeQNaN || Other.MayBeSNaN) && !isEmptySet());
+ if (isNaNOnly() || Other.isNaNOnly())
+ return getNaNOnly(Sem, /*MayBeQNaN=*/ResMayBeQNaN,
+ /*MayBeSNaN=*/false);
+ std::optional<SameSignRange> LHSNeg, LHSPos, RHSNeg, RHSPos;
+ splitPosNeg(Lower, Upper, LHSNeg, LHSPos);
+ splitPosNeg(Other.Lower, Other.Upper, RHSNeg, RHSPos);
+ APFloat ResLower = APFloat::getInf(Sem, /*Negative=*/false);
+ APFloat ResUpper = APFloat::getInf(Sem, /*Negative=*/true);
+ auto Update = [&](std::optional<SameSignRange> &LHS,
+ std::optional<SameSignRange> &RHS, bool Negative) {
+ if (!LHS || !RHS)
+ return;
+ // 0 * inf = QNaN
+ ResMayBeQNaN |= LHS->HasZero && RHS->HasInf;
+ ResMayBeQNaN |= RHS->HasZero && LHS->HasInf;
+ // NonZero * inf = inf
+ if ((LHS->HasInf && RHS->HasNonZero) || (RHS->HasInf && LHS->HasNonZero))
+ (Negative ? ResLower : ResUpper) = APFloat::getInf(Sem, Negative);
+ // Finite * Finite
+ if (LHS->FinitePart && RHS->FinitePart) {
+ APFloat NewLower = LHS->FinitePart->first * RHS->FinitePart->first;
+ APFloat NewUpper = LHS->FinitePart->second * RHS->FinitePart->second;
+ if (Negative) {
+ ResLower = minnum(ResLower, -NewUpper);
+ ResUpper = maxnum(ResUpper, -NewLower);
+ } else {
+ ResLower = minnum(ResLower, NewLower);
+ ResUpper = maxnum(ResUpper, NewUpper);
+ }
+ }
+ };
+ Update(LHSNeg, RHSNeg, /*Negative=*/false);
+ Update(LHSNeg, RHSPos, /*Negative=*/true);
+ Update(LHSPos, RHSNeg, /*Negative=*/true);
+ Update(LHSPos, RHSPos, /*Negative=*/false);
+ return ConstantFPRange(ResLower, ResUpper, ResMayBeQNaN, /*MayBeSNaN=*/false);
+}
+
+ConstantFPRange ConstantFPRange::div(const ConstantFPRange &Other) const {
+ auto &Sem = getSemantics();
+ bool ResMayBeQNaN = ((MayBeQNaN || MayBeSNaN) && !Other.isEmptySet()) ||
+ ((Other.MayBeQNaN || Other.MayBeSNaN) && !isEmptySet());
+ if (isNaNOnly() || Other.isNaNOnly())
+ return getNaNOnly(Sem, /*MayBeQNaN=*/ResMayBeQNaN,
+ /*MayBeSNaN=*/false);
+ std::optional<SameSignRange> LHSNeg, LHSPos, RHSNeg, RHSPos;
+ splitPosNeg(Lower, Upper, LHSNeg, LHSPos);
+ splitPosNeg(Other.Lower, Other.Upper, RHSNeg, RHSPos);
+ APFloat ResLower = APFloat::getInf(Sem, /*Negative=*/false);
+ APFloat ResUpper = APFloat::getInf(Sem, /*Negative=*/true);
+ auto Update = [&](std::optional<SameSignRange> &LHS,
+ std::optional<SameSignRange> &RHS, bool Negative) {
+ if (!LHS || !RHS)
+ return;
+ // inf / inf = QNaN 0 / 0 = QNaN
+ ResMayBeQNaN |= LHS->HasInf && RHS->HasInf;
+ ResMayBeQNaN |= LHS->HasZero && RHS->HasZero;
+ // It is not straightforward to infer HasNonZeroFinite = HasFinite &&
+ // HasNonZero. By definition we have:
+ // HasFinite = HasNonZeroFinite || HasZero
+ // HasNonZero = HasNonZeroFinite || HasInf
+ // Since the range is contiguous, if both HasFinite and HasNonZero are true,
+ // HasNonZeroFinite must be true.
+ bool LHSHasNonZeroFinite = LHS->FinitePart && LHS->HasNonZero;
+ bool RHSHasNonZeroFinite = RHS->FinitePart && RHS->HasNonZero;
+ // inf / Finite = inf FiniteNonZero / 0 = inf
+ if ((LHS->HasInf && RHS->FinitePart) ||
+ (LHSHasNonZeroFinite && RHS->HasZero))
+ (Negative ? ResLower : ResUpper) = APFloat::getInf(Sem, Negative);
+ // Finite / inf = 0
+ if (LHS->FinitePart && RHS->HasInf) {
+ APFloat Zero = APFloat::getZero(Sem, /*Negative=*/Negative);
+ ResLower = minnum(ResLower, Zero);
+ ResUpper = maxnum(ResUpper, Zero);
+ }
+ // Finite / FiniteNonZero
+ if (LHS->FinitePart && RHSHasNonZeroFinite) {
+ assert(!RHS->FinitePart->second.isZero() &&
+ "Divisor should be non-zero.");
+ APFloat NewLower = LHS->FinitePart->first / RHS->FinitePart->second;
+ APFloat NewUpper = LHS->FinitePart->second /
+ (RHS->FinitePart->first.isZero()
+ ? APFloat::getSmallest(Sem, /*Negative=*/false)
+ : RHS->FinitePart->first);
+ if (Negative) {
+ ResLower = minnum(ResLower, -NewUpper);
+ ResUpper = maxnum(ResUpper, -NewLower);
+ } else {
+ ResLower = minnum(ResLower, NewLower);
+ ResUpper = maxnum(ResUpper, NewUpper);
+ }
+ }
+ };
+ Update(LHSNeg, RHSNeg, /*Negative=*/false);
+ Update(LHSNeg, RHSPos, /*Negative=*/true);
+ Update(LHSPos, RHSNeg, /*Negative=*/true);
+ Update(LHSPos, RHSPos, /*Negative=*/false);
+ return ConstantFPRange(ResLower, ResUpper, ResMayBeQNaN, /*MayBeSNaN=*/false);
+}
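
Both mul and div above reduce the problem to sign-consistent subranges, where the operations are monotone in each bound. A sketch of that splitting on plain doubles, ignoring the zero/inf/NaN bookkeeping the real code layers on top:

    #include <algorithm>
    #include <cstdio>
    #include <utility>

    using Range = std::pair<double, double>; // [Lo, Hi], Lo <= Hi

    // With both operands nonnegative, multiplication is monotone in each
    // bound, so the product range is [Lo1*Lo2, Hi1*Hi2]; a sign flip just
    // negates and swaps the bounds.
    Range mulSameSign(Range A, Range B, bool Negative) {
      double Lo = A.first * B.first, Hi = A.second * B.second;
      return Negative ? Range{-Hi, -Lo} : Range{Lo, Hi};
    }

    int main() {
      // [-3, 2] * [4, 5]: split [-3, 2] into [0, 3] (negated) and [0, 2].
      Range NegPart = mulSameSign({0, 3}, {4, 5}, /*Negative=*/true);
      Range PosPart = mulSameSign({0, 2}, {4, 5}, /*Negative=*/false);
      Range Res{std::min(NegPart.first, PosPart.first),
                std::max(NegPart.second, PosPart.second)};
      std::printf("[%g, %g]\n", Res.first, Res.second); // [-15, 10]
    }
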
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 2c2950c..cbce8bd 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -667,8 +667,11 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const {
if (CE->getOpcode() == Instruction::Sub) {
ConstantExpr *LHS = dyn_cast<ConstantExpr>(CE->getOperand(0));
ConstantExpr *RHS = dyn_cast<ConstantExpr>(CE->getOperand(1));
- if (LHS && RHS && LHS->getOpcode() == Instruction::PtrToInt &&
- RHS->getOpcode() == Instruction::PtrToInt) {
+ if (LHS && RHS &&
+ (LHS->getOpcode() == Instruction::PtrToInt ||
+ LHS->getOpcode() == Instruction::PtrToAddr) &&
+ (RHS->getOpcode() == Instruction::PtrToInt ||
+ RHS->getOpcode() == Instruction::PtrToAddr)) {
Constant *LHSOp0 = LHS->getOperand(0);
Constant *RHSOp0 = RHS->getOperand(0);
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 3f1cc1e..27d8294 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -4098,15 +4098,8 @@ LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B, const char *Str,
return wrap(unwrap(B)->CreateGlobalString(Str, Name));
}
-LLVMBool LLVMGetVolatile(LLVMValueRef MemAccessInst) {
- Value *P = unwrap(MemAccessInst);
- if (LoadInst *LI = dyn_cast<LoadInst>(P))
- return LI->isVolatile();
- if (StoreInst *SI = dyn_cast<StoreInst>(P))
- return SI->isVolatile();
- if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(P))
- return AI->isVolatile();
- return cast<AtomicCmpXchgInst>(P)->isVolatile();
+LLVMBool LLVMGetVolatile(LLVMValueRef Inst) {
+ return cast<Instruction>(unwrap(Inst))->isVolatile();
}
void LLVMSetVolatile(LLVMValueRef MemAccessInst, LLVMBool isVolatile) {
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 614c3a9..15c0198 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -1002,6 +1003,18 @@ CallInst *IRBuilderBase::CreateConstrainedFPCall(
return C;
}
+Value *IRBuilderBase::CreateSelectWithUnknownProfile(Value *C, Value *True,
+ Value *False,
+ StringRef PassName,
+ const Twine &Name) {
+ Value *Ret = CreateSelectFMF(C, True, False, {}, Name);
+ if (auto *SI = dyn_cast<SelectInst>(Ret)) {
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *SI, *SI->getParent()->getParent(), PassName);
+ }
+ return Ret;
+}
+
Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False,
const Twine &Name, Instruction *MDFrom) {
return CreateSelectFMF(C, True, False, {}, Name, MDFrom);
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 88e7c44..9060a89 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2965,8 +2965,7 @@ unsigned CastInst::isEliminableCastPair(Instruction::CastOps firstOp,
// zext, sext -> zext, because sext can't sign extend after zext
return Instruction::ZExt;
case 11: {
- // inttoptr, ptrtoint/ptrtoaddr -> bitcast if SrcSize<=PtrSize/AddrSize
- // and SrcSize==DstSize
+ // inttoptr, ptrtoint/ptrtoaddr -> integer cast
if (!DL)
return 0;
unsigned MidSize = secondOp == Instruction::PtrToAddr
@@ -2974,10 +2973,15 @@ unsigned CastInst::isEliminableCastPair(Instruction::CastOps firstOp,
: DL->getPointerTypeSizeInBits(MidTy);
unsigned SrcSize = SrcTy->getScalarSizeInBits();
unsigned DstSize = DstTy->getScalarSizeInBits();
- // TODO: Could also produce zext or trunc here.
- if (SrcSize <= MidSize && SrcSize == DstSize)
- return Instruction::BitCast;
- return 0;
+ // If the middle size is smaller than both source and destination,
+ // an additional masking operation would be required.
+ if (MidSize < SrcSize && MidSize < DstSize)
+ return 0;
+ if (DstSize < SrcSize)
+ return Instruction::Trunc;
+ if (DstSize > SrcSize)
+ return Instruction::ZExt;
+ return Instruction::BitCast;
}
case 12:
// addrspacecast, addrspacecast -> bitcast, if SrcAS == DstAS
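
The new trunc/zext results above replace the old "bitcast or give up" answer. Restating the decision table as a standalone helper over bit widths (assuming, per the patch, that inttoptr zero-extends or truncates the integer to the pointer's width):

    #include <cstdio>

    // An (inttoptr; ptrtoint/ptrtoaddr) pair of widths Src -> Mid -> Dst acts
    // like one integer cast unless the middle truncation drops bits that the
    // destination still wants, which would require an extra mask.
    const char *foldIntPtrIntPair(unsigned SrcSize, unsigned MidSize,
                                  unsigned DstSize) {
      if (MidSize < SrcSize && MidSize < DstSize)
        return "not foldable (middle would need a mask)";
      if (DstSize < SrcSize)
        return "trunc";
      if (DstSize > SrcSize)
        return "zext";
      return "bitcast";
    }

    int main() {
      std::printf("i32 -> p64 -> i32: %s\n", foldIntPtrIntPair(32, 64, 32));
      std::printf("i64 -> p64 -> i32: %s\n", foldIntPtrIntPair(64, 64, 32));
      std::printf("i32 -> p64 -> i64: %s\n", foldIntPtrIntPair(32, 64, 64));
      std::printf("i64 -> p32 -> i64: %s\n", foldIntPtrIntPair(64, 32, 64));
    }
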
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c9ff86b..c79a950 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -893,7 +893,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
if (GV.hasInitializer()) {
const Constant *Init = GV.getInitializer();
const ConstantArray *InitArray = dyn_cast<ConstantArray>(Init);
- Check(InitArray, "wrong initalizer for intrinsic global variable",
+ Check(InitArray, "wrong initializer for intrinsic global variable",
Init);
for (Value *Op : InitArray->operands()) {
Value *V = Op->stripPointerCasts();
diff --git a/llvm/lib/Support/DebugCounter.cpp b/llvm/lib/Support/DebugCounter.cpp
index 6b65720..5ab1def 100644
--- a/llvm/lib/Support/DebugCounter.cpp
+++ b/llvm/lib/Support/DebugCounter.cpp
@@ -136,6 +136,13 @@ struct DebugCounterOwner : DebugCounter {
cl::location(this->ShouldPrintCounter),
cl::init(false),
cl::desc("Print out debug counter info after all counters accumulated")};
+ cl::opt<bool, true> PrintDebugCounterQueries{
+ "print-debug-counter-queries",
+ cl::Hidden,
+ cl::Optional,
+ cl::location(this->ShouldPrintCounterQueries),
+ cl::init(false),
+ cl::desc("Print out each query of an enabled debug counter")};
cl::opt<bool, true> BreakOnLastCount{
"debug-counter-break-on-last",
cl::Hidden,
@@ -221,31 +228,40 @@ void DebugCounter::print(raw_ostream &OS) const {
}
}
+bool DebugCounter::handleCounterIncrement(CounterInfo &Info) {
+ int64_t CurrCount = Info.Count++;
+ uint64_t CurrIdx = Info.CurrChunkIdx;
+
+ if (Info.Chunks.empty())
+ return true;
+ if (CurrIdx >= Info.Chunks.size())
+ return false;
+
+ bool Res = Info.Chunks[CurrIdx].contains(CurrCount);
+ if (BreakOnLast && CurrIdx == (Info.Chunks.size() - 1) &&
+ CurrCount == Info.Chunks[CurrIdx].End) {
+ LLVM_BUILTIN_DEBUGTRAP;
+ }
+ if (CurrCount > Info.Chunks[CurrIdx].End) {
+ Info.CurrChunkIdx++;
+
+    // Handle consecutive blocks.
+ if (Info.CurrChunkIdx < Info.Chunks.size() &&
+ CurrCount == Info.Chunks[Info.CurrChunkIdx].Begin)
+ return true;
+ }
+ return Res;
+}
+
bool DebugCounter::shouldExecuteImpl(unsigned CounterName) {
auto &Us = instance();
auto Result = Us.Counters.find(CounterName);
if (Result != Us.Counters.end()) {
auto &CounterInfo = Result->second;
- int64_t CurrCount = CounterInfo.Count++;
- uint64_t CurrIdx = CounterInfo.CurrChunkIdx;
-
- if (CounterInfo.Chunks.empty())
- return true;
- if (CurrIdx >= CounterInfo.Chunks.size())
- return false;
-
- bool Res = CounterInfo.Chunks[CurrIdx].contains(CurrCount);
- if (Us.BreakOnLast && CurrIdx == (CounterInfo.Chunks.size() - 1) &&
- CurrCount == CounterInfo.Chunks[CurrIdx].End) {
- LLVM_BUILTIN_DEBUGTRAP;
- }
- if (CurrCount > CounterInfo.Chunks[CurrIdx].End) {
- CounterInfo.CurrChunkIdx++;
-
- /// Handle consecutive blocks.
- if (CounterInfo.CurrChunkIdx < CounterInfo.Chunks.size() &&
- CurrCount == CounterInfo.Chunks[CounterInfo.CurrChunkIdx].Begin)
- return true;
+ bool Res = Us.handleCounterIncrement(CounterInfo);
+ if (Us.ShouldPrintCounterQueries && CounterInfo.IsSet) {
+ dbgs() << "DebugCounter " << Us.RegisteredCounters[CounterName] << "="
+ << (CounterInfo.Count - 1) << (Res ? " execute" : " skip") << "\n";
}
return Res;
}
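For context, a typical counter definition and query against the existing DebugCounter API (the counter name here is made up for illustration); with the new -print-debug-counter-queries flag each such query is logged:

#include "llvm/Support/DebugCounter.h"
using namespace llvm;

// Illustrative counter; "mypass-fold" is a hypothetical name.
DEBUG_COUNTER(FoldCounter, "mypass-fold",
              "Controls which folds MyPass performs");

bool tryFold() {
  // -debug-counter=mypass-fold=... drives the chunk logic factored out into
  // handleCounterIncrement above; -print-debug-counter-queries additionally
  // prints each execute/skip decision.
  if (!DebugCounter::shouldExecute(FoldCounter))
    return false;
  // ... perform the transformation ...
  return true;
}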
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 2ea3a24..afce803 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -1363,9 +1363,12 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
}
case LISTSPLAT: {
const auto *Value = dyn_cast<TypedInit>(LHS);
- const auto *Size = dyn_cast<IntInit>(RHS);
- if (Value && Size) {
- SmallVector<const Init *, 8> Args(Size->getValue(), Value);
+ const auto *Count = dyn_cast<IntInit>(RHS);
+ if (Value && Count) {
+ if (Count->getValue() < 0)
+ PrintFatalError(Twine("!listsplat count ") + Count->getAsString() +
+ " is negative");
+ SmallVector<const Init *, 8> Args(Count->getValue(), Value);
return ListInit::get(Args, Value->getType());
}
break;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6965116..9926a4d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26196,9 +26196,10 @@ static SDValue performFlagSettingCombine(SDNode *N,
return DCI.CombineTo(N, Res, SDValue(N, 1));
}
- // Combine identical generic nodes into this node, re-using the result.
+ // Combine equivalent generic nodes into this node, re-using the result.
if (SDNode *Generic = DCI.DAG.getNodeIfExists(
- GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
+ GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS},
+ /*AllowCommute=*/true))
DCI.CombineTo(Generic, SDValue(N, 0));
return SDValue();
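The AllowCommute argument comes from the SelectionDAG change elsewhere in this patch. A sketch of its effect (not the DAG's internal implementation): without it only the exact operand order is found, with it an existing (Opc RHS, LHS) also matches for commutative opcodes. The helper below spells the swapped probe out explicitly for illustration.

static SDNode *findEquivalent(SelectionDAG &DAG, unsigned Opc, EVT VT,
                              SDValue LHS, SDValue RHS) {
  SDVTList VTs = DAG.getVTList(VT);
  if (SDNode *N = DAG.getNodeIfExists(Opc, VTs, {LHS, RHS}))
    return N;
  return DAG.getNodeIfExists(Opc, VTs, {RHS, LHS});
}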
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index dbe74b1..5700468 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2394,15 +2394,19 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
(TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
TII->isTRANS(MI)))
- Result = true;
+ Result = !MI.mayLoadOrStore();
else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
- TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
- Result = true;
+ TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) {
+    // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS).
+    // For our purposes these should not be classified as VALU, since doing so
+    // results in unexpected scheduling behavior.
+ Result = !MI.mayLoadOrStore();
+ }
else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
TII->isSALU(MI))
- Result = true;
+ Result = !MI.mayLoadOrStore();
else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
TII->isMFMAorWMMA(MI))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a44af5f..1b559a6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2833,8 +2833,8 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
R = getMad(DAG, DL, VT, YH, CH, Mad1);
}
- const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
- (Flags.hasNoInfs() || Options.NoInfsFPMath);
+ const bool IsFiniteOnly =
+ (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
// TODO: Check if known finite from source value.
if (!IsFiniteOnly) {
@@ -3161,9 +3161,8 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
- const auto &Options = getTargetMachine().Options;
- if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
+ if (!Flags.hasNoInfs()) {
SDValue OverflowCheckConst =
DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
SDValue Overflow =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ee466ca..596a895 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3575,7 +3575,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
const bool IsFiniteOnly =
(MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
- (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
+ MI.getFlag(MachineInstr::FmNoInfs);
if (!IsFiniteOnly) {
// Expand isfinite(x) => fabs(x) < inf
@@ -3864,9 +3864,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
R = B.buildSelect(Ty, Underflow, Zero, R);
- const auto &Options = MF.getTarget().Options;
-
- if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
+ if (!(Flags & MachineInstr::FmNoInfs)) {
auto OverflowCheckConst =
B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 71494be..4e11c4f 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -14,6 +14,7 @@
#include "GCNRegPressure.h"
#include "AMDGPU.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@@ -459,10 +460,14 @@ LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI,
+ GCNRegPressure::RegKind RegKind) {
GCNRPTracker::LiveRegSet LiveRegs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
auto Reg = Register::index2VirtReg(I);
+ if (RegKind != GCNRegPressure::TOTAL_KINDS &&
+ GCNRegPressure::getRegKind(Reg, MRI) != RegKind)
+ continue;
if (!LIS.hasInterval(Reg))
continue;
auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
@@ -986,3 +991,128 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
#undef PFX
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void llvm::dumpMaxRegPressure(MachineFunction &MF,
+ GCNRegPressure::RegKind Kind,
+ LiveIntervals &LIS,
+ const MachineLoopInfo *MLI) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ auto &OS = dbgs();
+ const char *RegName = GCNRegPressure::getName(Kind);
+
+ unsigned MaxNumRegs = 0;
+ const MachineInstr *MaxPressureMI = nullptr;
+ GCNUpwardRPTracker RPT(LIS);
+ for (const MachineBasicBlock &MBB : MF) {
+ RPT.reset(MRI, LIS.getSlotIndexes()->getMBBEndIdx(&MBB).getPrevSlot());
+ for (const MachineInstr &MI : reverse(MBB)) {
+ RPT.recede(MI);
+ unsigned NumRegs = RPT.getMaxPressure().getNumRegs(Kind);
+ if (NumRegs > MaxNumRegs) {
+ MaxNumRegs = NumRegs;
+ MaxPressureMI = &MI;
+ }
+ }
+ }
+
+ SlotIndex MISlot = LIS.getInstructionIndex(*MaxPressureMI);
+
+  // Max pressure can occur at either the early-clobber or the register slot.
+  // Choose whichever slot yields the larger live set. This is ugly, but it is
+  // only diagnostic code.
+ SlotIndex ECSlot = MISlot.getRegSlot(true);
+ SlotIndex RSlot = MISlot.getRegSlot(false);
+ GCNRPTracker::LiveRegSet ECLiveSet = getLiveRegs(ECSlot, LIS, MRI, Kind);
+ GCNRPTracker::LiveRegSet RLiveSet = getLiveRegs(RSlot, LIS, MRI, Kind);
+ unsigned ECNumRegs = getRegPressure(MRI, ECLiveSet).getNumRegs(Kind);
+ unsigned RNumRegs = getRegPressure(MRI, RLiveSet).getNumRegs(Kind);
+ GCNRPTracker::LiveRegSet *LiveSet =
+ ECNumRegs > RNumRegs ? &ECLiveSet : &RLiveSet;
+ SlotIndex MaxPressureSlot = ECNumRegs > RNumRegs ? ECSlot : RSlot;
+ assert(getRegPressure(MRI, *LiveSet).getNumRegs(Kind) == MaxNumRegs);
+
+ // Split live registers into single-def and multi-def sets.
+ GCNRegPressure SDefPressure, MDefPressure;
+ SmallVector<Register, 16> SDefRegs, MDefRegs;
+ for (auto [Reg, LaneMask] : *LiveSet) {
+ assert(GCNRegPressure::getRegKind(Reg, MRI) == Kind);
+ LiveInterval &LI = LIS.getInterval(Reg);
+ if (LI.getNumValNums() == 1 ||
+ (LI.hasSubRanges() &&
+ llvm::all_of(LI.subranges(), [](const LiveInterval::SubRange &SR) {
+ return SR.getNumValNums() == 1;
+ }))) {
+ SDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
+ SDefRegs.push_back(Reg);
+ } else {
+ MDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
+ MDefRegs.push_back(Reg);
+ }
+ }
+ unsigned SDefNumRegs = SDefPressure.getNumRegs(Kind);
+ unsigned MDefNumRegs = MDefPressure.getNumRegs(Kind);
+ assert(SDefNumRegs + MDefNumRegs == MaxNumRegs);
+
+ auto printLoc = [&](const MachineBasicBlock *MBB, SlotIndex SI) {
+ return Printable([&, MBB, SI](raw_ostream &OS) {
+ OS << SI << ':' << printMBBReference(*MBB);
+ if (MLI)
+ if (const MachineLoop *ML = MLI->getLoopFor(MBB))
+ OS << " (LoopHdr " << printMBBReference(*ML->getHeader())
+ << ", Depth " << ML->getLoopDepth() << ")";
+ });
+ };
+
+ auto PrintRegInfo = [&](Register Reg, LaneBitmask LiveMask) {
+ GCNRegPressure RegPressure;
+ RegPressure.inc(Reg, LaneBitmask::getNone(), LiveMask, MRI);
+ OS << " " << printReg(Reg, TRI) << ':'
+ << TRI->getRegClassName(MRI.getRegClass(Reg)) << ", LiveMask "
+ << PrintLaneMask(LiveMask) << " (" << RegPressure.getNumRegs(Kind) << ' '
+ << RegName << "s)\n";
+
+ // Use std::map to sort def/uses by SlotIndex.
+ std::map<SlotIndex, const MachineInstr *> Instrs;
+ for (const MachineInstr &MI : MRI.reg_nodbg_instructions(Reg)) {
+ Instrs[LIS.getInstructionIndex(MI).getRegSlot()] = &MI;
+ }
+
+ for (const auto &[SI, MI] : Instrs) {
+ OS << " ";
+ if (MI->definesRegister(Reg, TRI))
+ OS << "def ";
+ if (MI->readsRegister(Reg, TRI))
+ OS << "use ";
+ OS << printLoc(MI->getParent(), SI) << ": " << *MI;
+ }
+ };
+
+ OS << "\n*** Register pressure info (" << RegName << "s) for " << MF.getName()
+ << " ***\n";
+ OS << "Max pressure is " << MaxNumRegs << ' ' << RegName << "s at "
+ << printLoc(MaxPressureMI->getParent(), MaxPressureSlot) << ": "
+ << *MaxPressureMI;
+
+ OS << "\nLive registers with single definition (" << SDefNumRegs << ' '
+ << RegName << "s):\n";
+
+ // Sort SDefRegs by number of uses (smallest first)
+ llvm::sort(SDefRegs, [&](Register A, Register B) {
+ return std::distance(MRI.use_nodbg_begin(A), MRI.use_nodbg_end()) <
+ std::distance(MRI.use_nodbg_begin(B), MRI.use_nodbg_end());
+ });
+
+ for (const Register Reg : SDefRegs) {
+ PrintRegInfo(Reg, LiveSet->lookup(Reg));
+ }
+
+ OS << "\nLive registers with multiple definitions (" << MDefNumRegs << ' '
+ << RegName << "s):\n";
+ for (const Register Reg : MDefRegs) {
+ PrintRegInfo(Reg, LiveSet->lookup(Reg));
+ }
+}
+#endif
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 898d1ff..979a8b0 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -31,6 +31,12 @@ class SlotIndex;
struct GCNRegPressure {
enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS };
+ static constexpr const char *getName(RegKind Kind) {
+ const char *Names[] = {"SGPR", "VGPR", "AGPR", "AVGPR"};
+ assert(Kind < TOTAL_KINDS);
+ return Names[Kind];
+ }
+
GCNRegPressure() {
clear();
}
@@ -41,6 +47,11 @@ struct GCNRegPressure {
void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
+ unsigned getNumRegs(RegKind Kind) const {
+ assert(Kind < TOTAL_KINDS);
+ return Value[Kind];
+ }
+
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR]; }
/// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
@@ -138,6 +149,12 @@ struct GCNRegPressure {
void dump() const;
+ static RegKind getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) {
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ const SIRegisterInfo *STI = static_cast<const SIRegisterInfo *>(TRI);
+ return (RegKind)getRegKind(MRI.getRegClass(Reg), STI);
+ }
+
private:
static constexpr unsigned ValueArraySize = TOTAL_KINDS * 2;
@@ -294,8 +311,10 @@ public:
}
};
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI);
+GCNRPTracker::LiveRegSet
+getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ GCNRegPressure::RegKind RegKind = GCNRegPressure::TOTAL_KINDS);
////////////////////////////////////////////////////////////////////////////////
// GCNUpwardRPTracker
@@ -428,9 +447,6 @@ LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
const MachineRegisterInfo &MRI,
LaneBitmask LaneMaskFilter = LaneBitmask::getAll());
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI);
-
/// creates a map MachineInstr -> LiveRegSet
/// R - range of iterators on instructions
/// After - upon entry or exit of every instruction
@@ -524,6 +540,11 @@ public:
}
};
+LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF,
+ GCNRegPressure::RegKind Kind,
+ LiveIntervals &LIS,
+ const MachineLoopInfo *MLI);
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
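A small sketch combining the new accessors (the helper itself is hypothetical): computing the pressure one register kind contributes to a live set, equivalent to filtering getLiveRegs by its new RegKind parameter.

static unsigned getKindPressure(const GCNRPTracker::LiveRegSet &LiveSet,
                                GCNRegPressure::RegKind Kind,
                                const MachineRegisterInfo &MRI) {
  GCNRegPressure RP;
  for (auto [Reg, LaneMask] : LiveSet)
    if (GCNRegPressure::getRegKind(Reg, MRI) == Kind)
      RP.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
  return RP.getNumRegs(Kind);
}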
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index bdc0810..58482ea 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -69,6 +69,21 @@ static cl::opt<bool> GCNTrackers(
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+#define DUMP_MAX_REG_PRESSURE
+static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
+ "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
+ cl::desc("Print a list of live registers along with their def/uses at the "
+ "point of maximum register pressure before scheduling."),
+ cl::init(false));
+
+static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
+ "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
+ cl::desc("Print a list of live registers along with their def/uses at the "
+ "point of maximum register pressure after scheduling."),
+ cl::init(false));
+#endif
+
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -960,6 +975,14 @@ void GCNScheduleDAGMILive::runSchedStages() {
RegionLiveOuts.buildLiveRegMap();
}
+#ifdef DUMP_MAX_REG_PRESSURE
+ if (PrintMaxRPRegUsageBeforeScheduler) {
+ dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
+ dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
+ LIS->dump();
+ }
+#endif
+
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
auto Stage = createSchedStage(S.getCurrentStage());
@@ -995,6 +1018,14 @@ void GCNScheduleDAGMILive::runSchedStages() {
Stage->finalizeGCNSchedStage();
}
+
+#ifdef DUMP_MAX_REG_PRESSURE
+ if (PrintMaxRPRegUsageAfterScheduler) {
+ dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
+ dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
+ LIS->dump();
+ }
+#endif
}
#ifndef NDEBUG
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 64e34db..5f6d742 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -260,8 +260,12 @@ class NSAHelper {
}
class MIMGNSAHelper<int num_addrs,
- list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)>
- : NSAHelper<> {
+ list<RegisterOperand> addr_types_in=[]>
+ : NSAHelper<> {
+ list<RegisterOperand> addr_types =
+ !if(!empty(addr_types_in), !listsplat(VGPROp_32, num_addrs),
+ addr_types_in);
+
list<string> AddrAsmNames = !foreach(i, !range(num_addrs), "vaddr" # i);
let AddrIns = !dag(ins, addr_types, AddrAsmNames);
let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";
@@ -358,7 +362,7 @@ class MIMG_gfx11<int op, dag outs, string dns = "">
// Base class for all NSA MIMG instructions.
// Note that 1-dword addresses always use non-NSA variants.
class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
- list<RegisterClass> addr_types=[],
+ list<RegisterOperand> addr_types=[],
RegisterOperand LastAddrRC = VGPROp_32>
: MIMG<outs, dns>, MIMGe_gfx11<op> {
let SubtargetPredicate = isGFX11Only;
@@ -378,7 +382,7 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
}
class VIMAGE_gfx12<int op, dag outs, int num_addrs, string dns="",
- list<RegisterClass> addr_types=[]>
+ list<RegisterOperand> addr_types=[]>
: VIMAGE<outs, dns>, VIMAGEe<op> {
let SubtargetPredicate = isGFX12Plus;
let AssemblerPredicate = isGFX12Plus;
@@ -1521,12 +1525,12 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16, bit isDual, bit isBVH8> {
int VAddrDwords = !srl(Size, 5);
int GFX11PlusNSAAddrs = !if(IsA16, 4, 5);
- RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32);
- list<RegisterClass> GFX11PlusAddrTypes =
- !cond(isBVH8 : [node_ptr_type, VReg_64, VReg_96, VReg_96, VGPR_32],
- isDual : [node_ptr_type, VReg_64, VReg_96, VReg_96, VReg_64],
- IsA16 : [node_ptr_type, VGPR_32, VReg_96, VReg_96],
- true : [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]);
+ RegisterOperand node_ptr_type = !if(Is64, VGPROp_64, VGPROp_32);
+ list<RegisterOperand> GFX11PlusAddrTypes =
+ !cond(isBVH8 : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_32],
+ isDual : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_64],
+ IsA16 : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96],
+ true : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96, VGPROp_96]);
}
class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterOperand AddrRC>
@@ -1552,7 +1556,7 @@ class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterOperand AddrRC>
}
class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
- list<RegisterClass> addr_types>
+ list<RegisterOperand> addr_types>
: MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "GFX11",
addr_types> {
let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16));
@@ -1561,7 +1565,7 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs,
bit isDual, bit isBVH8,
- list<RegisterClass> addr_types>
+ list<RegisterOperand> addr_types>
: VIMAGE_gfx12<op.GFX12, !if(!or(isDual, isBVH8),
(outs VReg_320:$vdata, VReg_96:$ray_origin_out,
VReg_96:$ray_dir_out),
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index bef4868..7e7ee75 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -280,6 +280,10 @@ static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
}
void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
+ if (!Subtarget->hasTcgen05InstSupport())
+ report_fatal_error(
+ "tcgen05.ld is not supported on this architecture variant");
+
SDLoc DL(N);
unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
@@ -2136,6 +2140,10 @@ static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
}
void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
+ if (!Subtarget->hasTcgen05InstSupport())
+ report_fatal_error(
+ "tcgen05.st is not supported on this architecture variant");
+
SDLoc DL(N);
unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 6c14cf0..dfde0cc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -101,6 +101,22 @@ def PrmtMode : Operand<i32> {
// NVPTX Instruction Predicate Definitions
//===----------------------------------------------------------------------===//
+// Checks PTX version and family-specific and architecture-specific SM versions.
+// For example, sm_100{f/a} and any future variants in the same family will match
+// for any PTX version greater than or equal to `PTXVersion`.
+class PTXWithFamilySMs<int PTXVersion, list<int> SMVersions> :
+ Predicate<"Subtarget->hasPTXWithFamilySMs(" # PTXVersion # ", {" #
+ !interleave(SMVersions, ", ") # "})">;
+
+// Checks PTX version and architecture-specific SM versions.
+// For example, sm_100{a} will match for any PTX version
+// greater than or equal to `PTXVersion`.
+class PTXWithAccelSMs<int PTXVersion, list<int> SMVersions> :
+ Predicate<"Subtarget->hasPTXWithAccelSMs(" # PTXVersion # ", {" #
+ !interleave(SMVersions, ", ") # "})">;
+
+// Helper predicate to call a subtarget method.
+class callSubtarget<string SubtargetMethod> : Predicate<"Subtarget->" # SubtargetMethod # "()">;
def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index a8b854f..22cf3a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -5103,8 +5103,8 @@ let Predicates = [hasSM<90>, hasPTX<78>] in {
def EXIT : NullaryInst<"exit", int_nvvm_exit>;
// Tcgen05 intrinsics
-let isConvergent = true, Predicates = [hasTcgen05Instructions] in {
-
+let isConvergent = true in {
+let Predicates = [callSubtarget<"hasTcgen05InstSupport">] in {
multiclass TCGEN05_ALLOC_INTR<string AS, string num, Intrinsic Intr> {
def "" : BasicNVPTXInst<(outs),
(ins ADDR:$dst, B32:$ncols),
@@ -5156,15 +5156,6 @@ defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<"", "2">;
defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<"shared", "1">;
defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<"shared", "2">;
-multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> {
- def "" : BasicNVPTXInst<(outs),
- (ins ADDR:$tmem_addr),
- "tcgen05.shift.cta_group::" # num # ".down",
- [(Intr addr:$tmem_addr)]>;
-}
-defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>;
-defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>;
-
multiclass TCGEN05_CP_INTR<string shape, string src_fmt, string mc = ""> {
defvar dst_fmt = !if(!eq(src_fmt, ""), "", ".b8x16");
defvar fmt_asm = StrJoin<".", [dst_fmt, src_fmt]>.ret;
@@ -5195,9 +5186,22 @@ foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in {
defm TCGEN05_CP_64x128_2 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::01_23">;
defm TCGEN05_CP_32x128 # src_fmt : TCGEN05_CP_INTR<"32x128b", src_fmt, "warpx4">;
}
+} // Predicates
+
+let Predicates = [callSubtarget<"hasTcgen05ShiftSupport">] in {
+multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> {
+ def "" : BasicNVPTXInst<(outs),
+ (ins ADDR:$tmem_addr),
+ "tcgen05.shift.cta_group::" # num # ".down",
+ [(Intr addr:$tmem_addr)]>;
+}
+defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>;
+defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>;
+} // Predicates
+
} // isConvergent
-let hasSideEffects = 1, Predicates = [hasTcgen05Instructions] in {
+let hasSideEffects = 1, Predicates = [callSubtarget<"hasTcgen05InstSupport">] in {
def tcgen05_fence_before_thread_sync: NullaryInst<
"tcgen05.fence::before_thread_sync", int_nvvm_tcgen05_fence_before_thread_sync>;
@@ -5231,8 +5235,7 @@ class TCGEN05_LDST_REGINFO<int Veclen> {
//
class TCGEN05_LD_INST<string Shape, int Num, bit Pack> :
- NVPTXInst<(outs), (ins), "?", []>,
- Requires<[hasTcgen05Instructions]> {
+ NVPTXInst<(outs), (ins), "?", []> {
TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO<
NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.veclen>;
@@ -5256,8 +5259,7 @@ class TCGEN05_LD_INST<string Shape, int Num, bit Pack> :
//
class TCGEN05_ST_INST<string Shape, int Num, bit Unpack> :
- NVPTXInst<(outs), (ins), "?", []>,
- Requires<[hasTcgen05Instructions]> {
+ NVPTXInst<(outs), (ins), "?", []> {
TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO<
NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.veclen>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index c548967..989be50 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -72,6 +72,40 @@ const SelectionDAGTargetInfo *NVPTXSubtarget::getSelectionDAGInfo() const {
return TSInfo.get();
}
+bool NVPTXSubtarget::hasPTXWithFamilySMs(unsigned PTXVersion,
+ ArrayRef<unsigned> SMVersions) const {
+ unsigned PTXVer = getPTXVersion();
+ if (!hasFamilySpecificFeatures() || PTXVer < PTXVersion)
+ return false;
+
+ unsigned SMVer = getSmVersion();
+ return llvm::any_of(SMVersions, [&](unsigned SM) {
+ // sm_101 is a different family, never group it with sm_10x.
+ if (SMVer == 101 || SM == 101)
+ return SMVer == SM &&
+ // PTX 9.0 and later renamed sm_101 to sm_110, so sm_101 is not
+ // supported.
+ !(PTXVer >= 90 && SMVer == 101);
+
+ return getSmFamilyVersion() == SM / 10 && SMVer >= SM;
+ });
+}
+
+bool NVPTXSubtarget::hasPTXWithAccelSMs(unsigned PTXVersion,
+ ArrayRef<unsigned> SMVersions) const {
+ unsigned PTXVer = getPTXVersion();
+ if (!hasArchAccelFeatures() || PTXVer < PTXVersion)
+ return false;
+
+ unsigned SMVer = getSmVersion();
+ return llvm::any_of(SMVersions, [&](unsigned SM) {
+ return SMVer == SM &&
+ // PTX 9.0 and later renamed sm_101 to sm_110, so sm_101 is not
+ // supported.
+ !(PTXVer >= 90 && SMVer == 101);
+ });
+}
+
bool NVPTXSubtarget::allowFP16Math() const {
return hasFP16Math() && NoF16Math == false;
}
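A worked example of the family rule, with values inferred from the code above (treat them as illustrative): a subtarget built for sm_103f with PTX 8.8, queried via hasPTXWithFamilySMs(88, {100, 101}).

// getSmVersion() == 103 and getSmFamilyVersion() == 10 for sm_103f.
// 103 != 101, so the sm_101 special case does not apply; then
// family 10 == 100 / 10 and 103 >= 100, so the query returns true.
// The non-sm_101 branch reduces to:
static bool familyMatches(unsigned SmFamilyVer, unsigned SmVer, unsigned SM) {
  return SmFamilyVer == SM / 10 && SmVer >= SM; // e.g. 10 == 10 && 103 >= 100
}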
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e81c56b..194dbdc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -73,6 +73,18 @@ public:
const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
+ // Checks PTX version and family-specific and architecture-specific SM
+ // versions. For example, sm_100{f/a} and any future variants in the same
+ // family will match for any PTX version greater than or equal to
+ // `PTXVersion`.
+ bool hasPTXWithFamilySMs(unsigned PTXVersion,
+ ArrayRef<unsigned> SMVersions) const;
+ // Checks PTX version and architecture-specific SM versions.
+ // For example, sm_100{a} will match for any PTX version greater than or equal
+ // to `PTXVersion`.
+ bool hasPTXWithAccelSMs(unsigned PTXVersion,
+ ArrayRef<unsigned> SMVersions) const;
+
bool has256BitVectorLoadStore(unsigned AS) const {
return SmVersion >= 100 && PTXVersion >= 88 &&
AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
@@ -127,6 +139,27 @@ public:
return HasTcgen05 && PTXVersion >= MinPTXVersion;
}
+  // Checks support for the following instructions:
+ // - tcgen05.ld/st
+ // - tcgen05.alloc/dealloc/relinquish
+ // - tcgen05.cp
+ // - tcgen05.fence/wait
+ // - tcgen05.commit
+ bool hasTcgen05InstSupport() const {
+ // sm_101 renamed to sm_110 in PTX 9.0
+ return hasPTXWithFamilySMs(90, {100, 110}) ||
+ hasPTXWithFamilySMs(88, {100, 101}) ||
+ hasPTXWithAccelSMs(86, {100, 101});
+ }
+
+ // Checks tcgen05.shift instruction support.
+ bool hasTcgen05ShiftSupport() const {
+ // sm_101 renamed to sm_110 in PTX 9.0
+ return hasPTXWithAccelSMs(90, {100, 110, 103}) ||
+ hasPTXWithAccelSMs(88, {100, 101, 103}) ||
+ hasPTXWithAccelSMs(86, {100, 101});
+ }
+
bool hasTcgen05MMAScaleInputDImm() const {
return FullSmVersion == 1003 && PTXVersion >= 86;
}
@@ -158,6 +191,7 @@ public:
bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
unsigned int getFullSmVersion() const { return FullSmVersion; }
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
+ unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; }
// GPUs with "a" suffix have architecture-accelerated features that are
// supported on the specified architecture only, hence such targets do not
// follow the onion layer model. hasArchAccelFeatures() allows distinguishing
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 4b54231..8851a0f 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -1659,6 +1659,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return generateImmOutOfRangeError(
Operands, ErrorInfo, -1, (1 << 5) - 1,
"immediate must be non-zero in the range");
+ case Match_InvalidXSfmmVType: {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return generateXSfmmVTypeError(ErrorLoc);
+ }
case Match_InvalidVTypeI: {
SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
return generateVTypeError(ErrorLoc);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 70b7c43..e75dfe3 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -142,6 +142,22 @@ enum {
ReadsPastVLShift = DestEEWShift + 2,
ReadsPastVLMask = 1ULL << ReadsPastVLShift,
+
+ // 0 -> Don't care about altfmt bit in VTYPE.
+ // 1 -> Is not altfmt.
+  // 2 -> Is altfmt (BF16).
+ AltFmtTypeShift = ReadsPastVLShift + 1,
+ AltFmtTypeMask = 3ULL << AltFmtTypeShift,
+
+ // XSfmmbase
+ HasTWidenOpShift = AltFmtTypeShift + 2,
+ HasTWidenOpMask = 1ULL << HasTWidenOpShift,
+
+ HasTMOpShift = HasTWidenOpShift + 1,
+ HasTMOpMask = 1ULL << HasTMOpShift,
+
+ HasTKOpShift = HasTMOpShift + 1,
+ HasTKOpMask = 1ULL << HasTKOpShift,
};
// Helper functions to read TSFlags.
@@ -183,6 +199,11 @@ static inline bool hasRoundModeOp(uint64_t TSFlags) {
return TSFlags & HasRoundModeOpMask;
}
+enum class AltFmtType { DontCare, NotAltFmt, AltFmt };
+static inline AltFmtType getAltFmtType(uint64_t TSFlags) {
+ return static_cast<AltFmtType>((TSFlags & AltFmtTypeMask) >> AltFmtTypeShift);
+}
+
/// \returns true if this instruction uses vxrm
static inline bool usesVXRM(uint64_t TSFlags) { return TSFlags & UsesVXRMMask; }
@@ -204,11 +225,47 @@ static inline bool readsPastVL(uint64_t TSFlags) {
return TSFlags & ReadsPastVLMask;
}
+// XSfmmbase
+static inline bool hasTWidenOp(uint64_t TSFlags) {
+ return TSFlags & HasTWidenOpMask;
+}
+
+static inline bool hasTMOp(uint64_t TSFlags) { return TSFlags & HasTMOpMask; }
+
+static inline bool hasTKOp(uint64_t TSFlags) { return TSFlags & HasTKOpMask; }
+
+static inline unsigned getTNOpNum(const MCInstrDesc &Desc) {
+ const uint64_t TSFlags = Desc.TSFlags;
+ assert(hasTWidenOp(TSFlags) && hasVLOp(TSFlags));
+ unsigned Offset = 3;
+ if (hasTKOp(TSFlags))
+ Offset = 4;
+ return Desc.getNumOperands() - Offset;
+}
+
+static inline unsigned getTMOpNum(const MCInstrDesc &Desc) {
+ const uint64_t TSFlags = Desc.TSFlags;
+ assert(hasTWidenOp(TSFlags) && hasTMOp(TSFlags));
+ if (hasTKOp(TSFlags))
+ return Desc.getNumOperands() - 5;
+ // vtzero.t
+ return Desc.getNumOperands() - 4;
+}
+
+static inline unsigned getTKOpNum(const MCInstrDesc &Desc) {
+ [[maybe_unused]] const uint64_t TSFlags = Desc.TSFlags;
+ assert(hasTWidenOp(TSFlags) && hasTKOp(TSFlags));
+ return Desc.getNumOperands() - 3;
+}
+
static inline unsigned getVLOpNum(const MCInstrDesc &Desc) {
const uint64_t TSFlags = Desc.TSFlags;
// This method is only called if we expect to have a VL operand, and all
// instructions with VL also have SEW.
assert(hasSEWOp(TSFlags) && hasVLOp(TSFlags));
+  // In XSfmmbase, TN is an alias for VL, so here we use the same TSFlags bit.
+ if (hasTWidenOp(TSFlags))
+ return getTNOpNum(Desc);
unsigned Offset = 2;
if (hasVecPolicyOp(TSFlags))
Offset = 3;
@@ -226,7 +283,7 @@ static inline unsigned getSEWOpNum(const MCInstrDesc &Desc) {
const uint64_t TSFlags = Desc.TSFlags;
assert(hasSEWOp(TSFlags));
unsigned Offset = 1;
- if (hasVecPolicyOp(TSFlags))
+ if (hasVecPolicyOp(TSFlags) || hasTWidenOp(TSFlags))
Offset = 2;
return Desc.getNumOperands() - Offset;
}
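A worked example of the trailing-operand indexing, assuming the XSfmm matmul pseudo layout defined later in this patch: with declared operands (rd, vs2, vs1, tm, tn, tk, sew, twiden), getNumOperands() is 8 and the helpers resolve as sketched below.

// Illustrative only; mirrors the offsets used by the accessors above.
static void checkXSfmmLayout(const MCInstrDesc &Desc) {
  assert(Desc.getNumOperands() == 8 && "example assumes the 8-operand form");
  assert(RISCVII::getTKOpNum(Desc) == 5); // tk = 8 - 3
  assert(RISCVII::getTNOpNum(Desc) == 4); // tn = 8 - 4 (doubles as VL)
  assert(RISCVII::getTMOpNum(Desc) == 3); // tm = 8 - 5
  // sew sits at index 6 (8 - 2) and twiden at 7 (8 - 1), matching
  // getSEWOpNum's hasTWidenOp offset.
}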
@@ -243,6 +300,9 @@ static inline int getFRMOpNum(const MCInstrDesc &Desc) {
if (!hasRoundModeOp(TSFlags) || usesVXRM(TSFlags))
return -1;
+ if (hasTWidenOp(TSFlags) && hasTMOp(TSFlags))
+ return getTMOpNum(Desc) - 1;
+
// The operand order
// --------------------------------------
// | n-1 (if any) | n-2 | n-3 | n-4 |
@@ -385,7 +445,9 @@ enum OperandType : unsigned {
OPERAND_SEW_MASK,
// Vector rounding mode for VXRM or FRM.
OPERAND_VEC_RM,
- OPERAND_LAST_RISCV_IMM = OPERAND_VEC_RM,
+ // Vtype operand for XSfmm extension.
+  // VTYPE operand for the XSfmm extension.
+ OPERAND_LAST_RISCV_IMM = OPERAND_XSFMM_VTYPE,
// Operand is either a register or uimm5; this is used by V extension pseudo
// instructions to represent a value that can be passed as AVL to either vsetvli
// or vsetivli.
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index cf8d120..9ed3b97 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -168,10 +168,13 @@ struct DemandedFields {
// If this is true, we demand that VTYPE is set to some legal state, i.e. that
// vill is unset.
bool VILL = false;
+ bool UseTWiden = false;
+ bool UseAltFmt = false;
// Return true if any part of VTYPE was used
bool usedVTYPE() const {
- return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL;
+ return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL ||
+ UseTWiden || UseAltFmt;
}
// Return true if any property of VL was used
@@ -187,6 +190,8 @@ struct DemandedFields {
TailPolicy = true;
MaskPolicy = true;
VILL = true;
+ UseTWiden = true;
+ UseAltFmt = true;
}
// Mark all VL properties as demanded
@@ -212,6 +217,8 @@ struct DemandedFields {
TailPolicy |= B.TailPolicy;
MaskPolicy |= B.MaskPolicy;
VILL |= B.VILL;
+ UseAltFmt |= B.UseAltFmt;
+ UseTWiden |= B.UseTWiden;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -258,7 +265,9 @@ struct DemandedFields {
OS << "SEWLMULRatio=" << SEWLMULRatio << ", ";
OS << "TailPolicy=" << TailPolicy << ", ";
OS << "MaskPolicy=" << MaskPolicy << ", ";
- OS << "VILL=" << VILL;
+ OS << "VILL=" << VILL << ", ";
+ OS << "UseAltFmt=" << UseAltFmt << ", ";
+ OS << "UseTWiden=" << UseTWiden;
OS << "}";
}
#endif
@@ -328,6 +337,15 @@ static bool areCompatibleVTYPEs(uint64_t CurVType, uint64_t NewVType,
if (Used.MaskPolicy && RISCVVType::isMaskAgnostic(CurVType) !=
RISCVVType::isMaskAgnostic(NewVType))
return false;
+ if (Used.UseTWiden && (RISCVVType::hasXSfmmWiden(CurVType) !=
+ RISCVVType::hasXSfmmWiden(NewVType) ||
+ (RISCVVType::hasXSfmmWiden(CurVType) &&
+ RISCVVType::getXSfmmWiden(CurVType) !=
+ RISCVVType::getXSfmmWiden(NewVType))))
+ return false;
+ if (Used.UseAltFmt &&
+ RISCVVType::isAltFmt(CurVType) != RISCVVType::isAltFmt(NewVType))
+ return false;
return true;
}
@@ -479,6 +497,11 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
Res.TailPolicy = false;
}
+ Res.UseAltFmt = RISCVII::getAltFmtType(MI.getDesc().TSFlags) !=
+ RISCVII::AltFmtType::DontCare;
+ Res.UseTWiden = RISCVII::hasTWidenOp(MI.getDesc().TSFlags) ||
+ RISCVInstrInfo::isXSfmmVectorConfigInstr(MI);
+
return Res;
}
@@ -510,6 +533,8 @@ class VSETVLIInfo {
uint8_t TailAgnostic : 1;
uint8_t MaskAgnostic : 1;
uint8_t SEWLMULRatioOnly : 1;
+ uint8_t AltFmt : 1;
+ uint8_t TWiden : 3;
public:
VSETVLIInfo()
@@ -586,6 +611,8 @@ public:
RISCVVType::VLMUL getVLMUL() const { return VLMul; }
bool getTailAgnostic() const { return TailAgnostic; }
bool getMaskAgnostic() const { return MaskAgnostic; }
+ bool getAltFmt() const { return AltFmt; }
+ unsigned getTWiden() const { return TWiden; }
bool hasNonZeroAVL(const LiveIntervals *LIS) const {
if (hasAVLImm())
@@ -647,21 +674,31 @@ public:
SEW = RISCVVType::getSEW(VType);
TailAgnostic = RISCVVType::isTailAgnostic(VType);
MaskAgnostic = RISCVVType::isMaskAgnostic(VType);
+ AltFmt = RISCVVType::isAltFmt(VType);
+ TWiden =
+ RISCVVType::hasXSfmmWiden(VType) ? RISCVVType::getXSfmmWiden(VType) : 0;
}
- void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA) {
+ void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA, bool Altfmt,
+ unsigned W) {
assert(isValid() && !isUnknown() &&
"Can't set VTYPE for uninitialized or unknown");
VLMul = L;
SEW = S;
TailAgnostic = TA;
MaskAgnostic = MA;
+ AltFmt = Altfmt;
+ TWiden = W;
}
+ void setAltFmt(bool AF) { AltFmt = AF; }
+
void setVLMul(RISCVVType::VLMUL VLMul) { this->VLMul = VLMul; }
unsigned encodeVTYPE() const {
assert(isValid() && !isUnknown() && !SEWLMULRatioOnly &&
"Can't encode VTYPE for uninitialized or unknown");
+ if (TWiden != 0)
+ return RISCVVType::encodeXSfmmVType(SEW, TWiden, AltFmt);
return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic);
}
@@ -674,9 +711,9 @@ public:
"Can't compare VTYPE in unknown state");
assert(!SEWLMULRatioOnly && !Other.SEWLMULRatioOnly &&
"Can't compare when only LMUL/SEW ratio is valid.");
- return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic) ==
+ return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, TWiden) ==
std::tie(Other.VLMul, Other.SEW, Other.TailAgnostic,
- Other.MaskAgnostic);
+ Other.MaskAgnostic, Other.AltFmt, Other.TWiden);
}
unsigned getSEWLMULRatio() const {
@@ -825,7 +862,9 @@ public:
<< "SEW=e" << (unsigned)SEW << ", "
<< "TailAgnostic=" << (bool)TailAgnostic << ", "
<< "MaskAgnostic=" << (bool)MaskAgnostic << ", "
- << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << "}";
+ << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << ", "
+ << "TWiden=" << (unsigned)TWiden << ", "
+ << "AltFmt=" << (bool)AltFmt << "}";
}
#endif
};
@@ -853,6 +892,11 @@ struct BlockData {
BlockData() = default;
};
+enum TKTMMode {
+ VSETTK = 0,
+ VSETTM = 1,
+};
+
class RISCVInsertVSETVLI : public MachineFunctionPass {
const RISCVSubtarget *ST;
const TargetInstrInfo *TII;
@@ -908,6 +952,7 @@ private:
VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) const;
VSETVLIInfo computeInfoForInstr(const MachineInstr &MI) const;
void forwardVSETVLIAVL(VSETVLIInfo &Info) const;
+ bool insertVSETMTK(MachineBasicBlock &MBB, TKTMMode Mode) const;
};
} // end anonymous namespace
@@ -945,6 +990,18 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const {
VSETVLIInfo NewInfo;
if (MI.getOpcode() == RISCV::PseudoVSETIVLI) {
NewInfo.setAVLImm(MI.getOperand(1).getImm());
+ } else if (RISCVInstrInfo::isXSfmmVectorConfigTNInstr(MI)) {
+ assert(MI.getOpcode() == RISCV::PseudoSF_VSETTNT ||
+ MI.getOpcode() == RISCV::PseudoSF_VSETTNTX0);
+ switch (MI.getOpcode()) {
+ case RISCV::PseudoSF_VSETTNTX0:
+ NewInfo.setAVLVLMAX();
+ break;
+ case RISCV::PseudoSF_VSETTNT:
+ Register ATNReg = MI.getOperand(1).getReg();
+ NewInfo.setAVLRegDef(getVNInfoFromReg(ATNReg, MI, LIS), ATNReg);
+ break;
+ }
} else {
assert(MI.getOpcode() == RISCV::PseudoVSETVLI ||
MI.getOpcode() == RISCV::PseudoVSETVLIX0);
@@ -1005,11 +1062,34 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
RISCVVType::VLMUL VLMul = RISCVII::getLMul(TSFlags);
+ bool AltFmt = RISCVII::getAltFmtType(TSFlags) == RISCVII::AltFmtType::AltFmt;
+ InstrInfo.setAltFmt(AltFmt);
+
unsigned Log2SEW = MI.getOperand(getSEWOpNum(MI)).getImm();
// A Log2SEW of 0 is an operation on mask registers only.
unsigned SEW = Log2SEW ? 1 << Log2SEW : 8;
assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
+ if (RISCVII::hasTWidenOp(TSFlags)) {
+ const MachineOperand &TWidenOp =
+ MI.getOperand(MI.getNumExplicitOperands() - 1);
+ unsigned TWiden = TWidenOp.getImm();
+
+ InstrInfo.setAVLVLMAX();
+ if (RISCVII::hasVLOp(TSFlags)) {
+ const MachineOperand &TNOp =
+ MI.getOperand(RISCVII::getTNOpNum(MI.getDesc()));
+
+ if (TNOp.getReg().isVirtual())
+ InstrInfo.setAVLRegDef(getVNInfoFromReg(TNOp.getReg(), MI, LIS),
+ TNOp.getReg());
+ }
+
+ InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, TWiden);
+
+ return InstrInfo;
+ }
+
if (RISCVII::hasVLOp(TSFlags)) {
const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
if (VLOp.isImm()) {
@@ -1045,7 +1125,9 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
assert(SEW == EEW && "Initial SEW doesn't match expected EEW");
}
#endif
- InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic);
+ // TODO: Propagate the twiden from previous vtype for potential reuse.
+ InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt,
+ /*TWiden*/ 0);
forwardVSETVLIAVL(InstrInfo);
@@ -1053,10 +1135,33 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
}
void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertPt, DebugLoc DL,
- const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo) {
-
+ MachineBasicBlock::iterator InsertPt,
+ DebugLoc DL, const VSETVLIInfo &Info,
+ const VSETVLIInfo &PrevInfo) {
++NumInsertedVSETVL;
+
+ if (Info.getTWiden()) {
+ if (Info.hasAVLVLMAX()) {
+ Register DestReg = MRI->createVirtualRegister(&RISCV::GPRNoX0RegClass);
+ auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoSF_VSETTNTX0))
+ .addReg(DestReg, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(Info.encodeVTYPE());
+ if (LIS) {
+ LIS->InsertMachineInstrInMaps(*MI);
+ LIS->createAndComputeVirtRegInterval(DestReg);
+ }
+ } else {
+ auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoSF_VSETTNT))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(Info.getAVLReg())
+ .addImm(Info.encodeVTYPE());
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*MI);
+ }
+ return;
+ }
+
if (PrevInfo.isValid() && !PrevInfo.isUnknown()) {
// Use X0, X0 form if the AVL is the same and the SEW+LMUL gives the same
// VLMAX.
@@ -1198,7 +1303,8 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
// be coalesced into another vsetvli since we won't demand any fields.
VSETVLIInfo NewInfo; // Need a new VSETVLIInfo to clear SEWLMULRatioOnly
NewInfo.setAVLImm(1);
- NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true);
+ NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true,
+ /*AltFmt*/ false, /*W*/ 0);
Info = NewInfo;
return;
}
@@ -1240,7 +1346,9 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
(Demanded.TailPolicy ? IncomingInfo : Info).getTailAgnostic() ||
IncomingInfo.getTailAgnostic(),
(Demanded.MaskPolicy ? IncomingInfo : Info).getMaskAgnostic() ||
- IncomingInfo.getMaskAgnostic());
+ IncomingInfo.getMaskAgnostic(),
+ (Demanded.UseAltFmt ? IncomingInfo : Info).getAltFmt(),
+ Demanded.UseTWiden ? IncomingInfo.getTWiden() : 0);
// If we only knew the sew/lmul ratio previously, replace the VTYPE but keep
// the AVL.
@@ -1293,7 +1401,8 @@ bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB,
if (RISCVInstrInfo::isVectorConfigInstr(MI) ||
RISCVII::hasSEWOp(MI.getDesc().TSFlags) ||
- isVectorCopy(ST->getRegisterInfo(), MI))
+ isVectorCopy(ST->getRegisterInfo(), MI) ||
+ RISCVInstrInfo::isXSfmmVectorConfigInstr(MI))
HadVectorOp = true;
transferAfter(Info, MI);
@@ -1675,6 +1784,12 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
};
for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
+ // TODO: Support XSfmm.
+ if (RISCVII::hasTWidenOp(MI.getDesc().TSFlags) ||
+ RISCVInstrInfo::isXSfmmVectorConfigInstr(MI)) {
+ NextMI = nullptr;
+ continue;
+ }
if (!RISCVInstrInfo::isVectorConfigInstr(MI)) {
Used.doUnion(getDemanded(MI, ST));
@@ -1788,6 +1903,65 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
}
}
+bool RISCVInsertVSETVLI::insertVSETMTK(MachineBasicBlock &MBB,
+ TKTMMode Mode) const {
+  bool Changed = false;
+ for (auto &MI : MBB) {
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+ if (RISCVInstrInfo::isXSfmmVectorConfigTMTKInstr(MI) ||
+ !RISCVII::hasSEWOp(TSFlags) || !RISCVII::hasTWidenOp(TSFlags))
+ continue;
+
+    if (Mode == VSETTK && !RISCVII::hasTKOp(TSFlags))
+      continue;
+
+    if (Mode == VSETTM && !RISCVII::hasTMOp(TSFlags))
+      continue;
+
+    VSETVLIInfo CurrInfo = computeInfoForInstr(MI);
+
+ unsigned OpNum = 0;
+ unsigned Opcode = 0;
+ switch (Mode) {
+ case VSETTK:
+ OpNum = RISCVII::getTKOpNum(MI.getDesc());
+ Opcode = RISCV::PseudoSF_VSETTK;
+ break;
+ case VSETTM:
+ OpNum = RISCVII::getTMOpNum(MI.getDesc());
+ Opcode = RISCV::PseudoSF_VSETTM;
+ break;
+ }
+
+ assert(OpNum && Opcode && "Invalid OpNum or Opcode");
+
+ MachineOperand &Op = MI.getOperand(OpNum);
+
+ auto TmpMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opcode))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(Op.getReg())
+ .addImm(Log2_32(CurrInfo.getSEW()))
+ .addImm(Log2_32(CurrInfo.getTWiden()) + 1);
+
+ Changed = true;
+ Register Reg = Op.getReg();
+ Op.setReg(Register());
+ Op.setIsKill(false);
+ if (LIS) {
+ LIS->InsertMachineInstrInMaps(*TmpMI);
+ LiveInterval &LI = LIS->getInterval(Reg);
+
+      // The AVL operand was cleared above; shrink the register's live
+      // interval to its remaining uses.
+ LIS->shrinkToUses(&LI);
+ // TODO: Enable this once needVSETVLIPHI is supported.
+ // SmallVector<LiveInterval *> SplitLIs;
+ // LIS->splitSeparateComponents(LI, SplitLIs);
+ }
+ }
+ return Changed;
+}
+
bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
// Skip if the vector extension is not enabled.
ST = &MF.getSubtarget<RISCVSubtarget>();
@@ -1865,6 +2039,11 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
insertReadVL(MBB);
+ for (MachineBasicBlock &MBB : MF) {
+ insertVSETMTK(MBB, VSETTM);
+ insertVSETMTK(MBB, VSETTK);
+ }
+
BlockInfo.clear();
return HaveVectorOp;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index 2afd77a..5b06303 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -267,6 +267,22 @@ class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr,
// operands' VLs.
bit ReadsPastVL = 0;
let TSFlags{26} = ReadsPastVL;
+
+ // 0 -> Don't care about altfmt bit in VTYPE.
+ // 1 -> Is not altfmt.
+  // 2 -> Is altfmt (BF16).
+ bits<2> AltFmtType = 0;
+ let TSFlags{28-27} = AltFmtType;
+
+ // XSfmmbase
+ bit HasTWidenOp = 0;
+ let TSFlags{29} = HasTWidenOp;
+
+ bit HasTmOp = 0;
+ let TSFlags{30} = HasTmOp;
+
+ bit HasTkOp = 0;
+ let TSFlags{31} = HasTkOp;
}
class RVInst<dag outs, dag ins, string opcodestr, string argstr,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 96e1078..ddb53a2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3005,6 +3005,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
else
Ok = RISCVFPRndMode::isValidRoundingMode(Imm);
break;
+ case RISCVOp::OPERAND_XSFMM_VTYPE:
+ Ok = RISCVVType::isValidXSfmmVType(Imm);
+ break;
}
if (!Ok) {
ErrInfo = "Invalid immediate";
@@ -3670,6 +3673,11 @@ std::string RISCVInstrInfo::createMIROperandComment(
RISCVVType::printVType(Imm, OS);
break;
}
+ case RISCVOp::OPERAND_XSFMM_VTYPE: {
+ unsigned Imm = Op.getImm();
+ RISCVVType::printXSfmmVType(Imm, OS);
+ break;
+ }
case RISCVOp::OPERAND_SEW:
case RISCVOp::OPERAND_SEW_MASK: {
unsigned Log2SEW = Op.getImm();
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 298d35a..c1b23af 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -128,6 +128,9 @@ defvar TAIL_AGNOSTIC = 1;
defvar TU_MU = 0;
defvar TA_MU = 1;
defvar TA_MA = 3;
+defvar DONT_CARE_ALTFMT = 0;
+defvar IS_NOT_ALTFMT = 1;
+defvar IS_ALTFMT = 2;
//===----------------------------------------------------------------------===//
// Utilities.
@@ -159,7 +162,8 @@ class PseudoToVInst<string PseudoInst> {
["_M4", ""],
["_M8", ""],
["_SE", ""],
- ["_RM", ""]
+ ["_RM", ""],
+ ["_ALT", ""]
];
string VInst = !foldl(PseudoInst, AffixSubsts, Acc, AffixSubst,
!subst(AffixSubst[0], AffixSubst[1], Acc));
@@ -6396,7 +6400,7 @@ let Defs = [VXSAT] in {
// 13. Vector Floating-Point Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
//===----------------------------------------------------------------------===//
// 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions
//===----------------------------------------------------------------------===//
@@ -6565,7 +6569,7 @@ defm PseudoVFNCVT_F_F : VPseudoVNCVTD_W_RM;
defm PseudoVFNCVT_ROD_F_F : VPseudoVNCVTD_W;
} // mayRaiseFPException = true
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
//===----------------------------------------------------------------------===//
// 14. Vector Reduction Operations
@@ -6593,7 +6597,7 @@ defm PseudoVWREDSUM : VPseudoVWRED_VS;
}
} // Predicates = [HasVInstructions]
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
//===----------------------------------------------------------------------===//
// 14.3. Vector Single-Width Floating-Point Reduction Instructions
//===----------------------------------------------------------------------===//
@@ -6612,7 +6616,7 @@ defm PseudoVFWREDUSUM : VPseudoVFWRED_VS_RM;
defm PseudoVFWREDOSUM : VPseudoVFWREDO_VS_RM;
}
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
//===----------------------------------------------------------------------===//
// 15. Vector Mask Instructions
@@ -6703,7 +6707,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
// 16.2. Floating-Point Scalar Move Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
foreach f = FPList in {
let HasSEWOp = 1, BaseInstr = VFMV_F_S in
@@ -6718,7 +6722,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>;
}
}
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
//===----------------------------------------------------------------------===//
// 16.3. Vector Slide Instructions
@@ -6730,10 +6734,10 @@ let Predicates = [HasVInstructions] in {
defm PseudoVSLIDE1DOWN : VPseudoVSLD1_VX;
} // Predicates = [HasVInstructions]
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
defm PseudoVFSLIDE1UP : VPseudoVSLD1_VF<"@earlyclobber $rd">;
defm PseudoVFSLIDE1DOWN : VPseudoVSLD1_VF;
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
//===----------------------------------------------------------------------===//
// 16.4. Vector Register Gather Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index 557d873..6a4119a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -438,8 +438,10 @@ let Predicates = [HasVendorXSfvcp] in {
}
foreach f = FPList in {
foreach m = f.MxList in {
- defm f.FX # "V" : VPseudoVC_XV<m, f.fprclass, payload1>;
- defm f.FX # "VV" : VPseudoVC_XVV<m, f.fprclass, payload1>;
+ let AltFmtType = IS_NOT_ALTFMT in {
+ defm f.FX # "V" : VPseudoVC_XV<m, f.fprclass, payload1>;
+ defm f.FX # "VV" : VPseudoVC_XVV<m, f.fprclass, payload1>;
+ }
}
}
foreach m = MxListW in {
@@ -449,7 +451,8 @@ let Predicates = [HasVendorXSfvcp] in {
}
foreach f = FPListW in {
foreach m = f.MxList in
- defm f.FX # "VW" : VPseudoVC_XVW<m, f.fprclass, payload1>;
+ let AltFmtType = IS_NOT_ALTFMT in
+ defm f.FX # "VW" : VPseudoVC_XVW<m, f.fprclass, payload1>;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
index a5ee701..5ad22e6b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
@@ -225,7 +225,7 @@ let Predicates = [HasVendorXSfmmbase] in {
def SF_VSETTM : SFInstSetSingle<(outs GPR:$rd), (ins GPR:$rs1), 0b00001,
"sf.vsettm", "$rd, $rs1">;
def SF_VSETTK : SFInstSetSingle<(outs GPR:$rd), (ins GPR:$rs1), 0b00010,
- "sf.vsettk", "$rd, $rs1">;
+ "sf.vsettk", "$rd, $rs1">;
def SF_VTDISCARD : SFInstVtDiscard<"sf.vtdiscard">;
def SF_VTMV_V_T : SFInstTileMoveOp<0b010000, (outs VR:$vd), (ins GPR:$rs1),
@@ -277,3 +277,144 @@ let Uses = [FRM], mayRaiseFPException = true in {
} // Predicates = [HasVendorXSfmm32a8f]
} // DecoderNamespace = "XSfvector"
+
+class VPseudoSF_VTileLoad
+ : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let HasVLOp = 1; // Tn
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+class VPseudoSF_VTileStore
+ : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let HasVLOp = 1; // Tn
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+class VPseudoSF_VTileMove_V_T
+ : RISCVVPseudo<(outs VRM8:$vd), (ins GPR:$rs1, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let HasVLOp = 1; // Tn
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+class VPseudoSF_VTileMove_T_V
+ : RISCVVPseudo<(outs), (ins GPR:$rs1, VRM8:$vs2, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let HasVLOp = 1; // Tn
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+class VPseudoSF_MatMul<RegisterClass mtd_class>
+ : RISCVVPseudo<(outs),
+ (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, AVL:$atm, AVL:$atn,
+ AVL:$atk, ixlenimm:$sew, ixlenimm:$twiden)> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let HasTmOp = 1;
+ let HasVLOp = 1; // Tn
+ let HasTkOp = 1;
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+class VPseudoSF_MatMul_FRM<RegisterClass mtd_class>
+ : RISCVVPseudo<(outs),
+ (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, ixlenimm:$frm,
+ AVL:$atm, AVL:$atn, AVL:$atk, ixlenimm:$sew,
+ ixlenimm:$twiden), []> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let HasTmOp = 1;
+ let HasVLOp = 1; // Tn
+ let HasTkOp = 1;
+ let HasSEWOp = 1;
+ let HasRoundModeOp = 1;
+ let hasPostISelHook = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+let Defs = [VL, VTYPE] in {
+ def PseudoSF_VSETTNT
+ : Pseudo<(outs GPR:$rd),
+ (ins GPRNoX0:$rs1, XSfmmVTypeOp:$vtypei), []>,
+ PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+ def PseudoSF_VSETTNTX0
+ : Pseudo<(outs GPRNoX0:$rd),
+ (ins GPRX0:$rs1, XSfmmVTypeOp:$vtypei), []>,
+ PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+ def PseudoSF_VSETTNTX0X0
+ : Pseudo<(outs GPRX0:$rd),
+ (ins GPRX0:$rs1, XSfmmVTypeOp:$vtypei), []>,
+ PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+}
+
+let Defs = [VTYPE], Uses = [VTYPE], HasTWidenOp = 1, HasSEWOp = 1 in {
+ def PseudoSF_VSETTM
+ : Pseudo<(outs GPR:$rd),
+ (ins GPR:$rs1, ixlenimm:$log2sew, ixlenimm:$twiden), []>,
+ PseudoInstExpansion<(SF_VSETTM GPR:$rd, GPR:$rs1)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+ def PseudoSF_VSETTK
+ : Pseudo<(outs GPR:$rd),
+ (ins GPR:$rs1, ixlenimm:$logwsew, ixlenimm:$twiden), []>,
+ PseudoInstExpansion<(SF_VSETTK GPR:$rd, GPR:$rs1)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+}
+}
+
+foreach eew = [8, 16, 32, 64] in {
+ def PseudoSF_VLTE # eew : VPseudoSF_VTileLoad;
+ def PseudoSF_VSTE # eew : VPseudoSF_VTileStore;
+}
+
+def PseudoSF_VTMV_T_V : VPseudoSF_VTileMove_T_V;
+def PseudoSF_VTMV_V_T : VPseudoSF_VTileMove_V_T;
+
+foreach a = I8Encodes in
+ foreach b = I8Encodes in
+ def PseudoSF_MM_ # !toupper(a.Name) # _ # !toupper(b.Name)
+ : VPseudoSF_MatMul<TRM4>;
+
+let AltFmtType = IS_NOT_ALTFMT in
+ def PseudoSF_MM_F_F : VPseudoSF_MatMul_FRM<TRM2>;
+let AltFmtType = IS_ALTFMT in
+ def PseudoSF_MM_F_F_ALT : VPseudoSF_MatMul_FRM<TRM2>;
+
+foreach e1 = [5, 4] in
+ foreach e2 = [5, 4] in
+ def PseudoSF_MM_E # e1 # M # !sub(7, e1) # _E # e2 # M # !sub(7, e2)
+ : VPseudoSF_MatMul_FRM<TRM4>;
+
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
+ let HasVLOp = 1, HasTmOp = 1, HasTWidenOp = 1, HasSEWOp = 1 in
+ def PseudoSF_VTZERO_T
+ : RISCVVPseudo<(outs),
+ (ins TR:$rd, AVL:$atm, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)>;
+ def PseudoSF_VTDISCARD : RISCVVPseudo<(outs), (ins), []>;
+}
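
For orientation, the two foreach blocks above expand to one pseudo per element width (PseudoSF_VLTE8/16/32/64 and PseudoSF_VSTE8/16/32/64). A minimal C++ sketch of how backend code could recognize the expanded opcodes; only the generated enum names are implied by the defs, the helper itself is hypothetical:

static bool isXSfmmTileLoadStore(unsigned Opcode) {
  switch (Opcode) {
  case RISCV::PseudoSF_VLTE8:
  case RISCV::PseudoSF_VLTE16:
  case RISCV::PseudoSF_VLTE32:
  case RISCV::PseudoSF_VLTE64:
  case RISCV::PseudoSF_VSTE8:
  case RISCV::PseudoSF_VSTE16:
  case RISCV::PseudoSF_VSTE32:
  case RISCV::PseudoSF_VSTE64:
    return true;
  default:
    return false;
  }
}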
diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
index 3658817..dcae977 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
@@ -78,7 +78,41 @@ def isVectorConfigInstr
PseudoVSETVLI,
PseudoVSETVLIX0,
PseudoVSETVLIX0X0,
- PseudoVSETIVLI
+ PseudoVSETIVLI,
+ PseudoSF_VSETTNT,
+ PseudoSF_VSETTNTX0,
+ PseudoSF_VSETTNTX0X0
+ ]>>>;
+
+// Returns true if this is a PseudoSF_VSETTNT* instruction.
+def isXSfmmVectorConfigTNInstr
+ : TIIPredicate<"isXSfmmVectorConfigTNInstr",
+ MCReturnStatement<
+ CheckOpcode<[
+ PseudoSF_VSETTNT,
+ PseudoSF_VSETTNTX0,
+ PseudoSF_VSETTNTX0X0
+ ]>>>;
+
+// Returns true if this is PseudoSF_VSETTM or PseudoSF_VSETTK.
+def isXSfmmVectorConfigTMTKInstr
+ : TIIPredicate<"isXSfmmVectorConfigTMTKInstr",
+ MCReturnStatement<
+ CheckOpcode<[
+ PseudoSF_VSETTM,
+ PseudoSF_VSETTK
+ ]>>>;
+
+// Returns true if this is an XSfmm vector configuration instruction.
+def isXSfmmVectorConfigInstr
+ : TIIPredicate<"isXSfmmVectorConfigInstr",
+ MCReturnStatement<
+ CheckOpcode<[
+ PseudoSF_VSETTNT,
+ PseudoSF_VSETTNTX0,
+ PseudoSF_VSETTNTX0X0,
+ PseudoSF_VSETTM,
+ PseudoSF_VSETTK
]>>>;
// Return true if this is 'vsetvli x0, x0, vtype' which preserves
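
As a usage sketch, TIIPredicate definitions are emitted as static helpers on the target's InstrInfo, so a pass could partition the XSfmm config instructions like this (the call site is an assumption, modeled on how the existing isVectorConfigInstr predicate is consumed):

// Sketch: the TN and TM/TK predicates partition isXSfmmVectorConfigInstr.
static bool isXSfmmConfigButNotTN(const MachineInstr &MI) {
  // By construction this is equivalent to isXSfmmVectorConfigTMTKInstr(MI).
  return RISCVInstrInfo::isXSfmmVectorConfigInstr(MI) &&
         !RISCVInstrInfo::isXSfmmVectorConfigTNInstr(MI);
}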
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 40b6416..e9f43b9 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -178,6 +178,10 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Shadow stack pointer.
markSuperRegs(Reserved, RISCV::SSP);
+  // XSfmmbase tile registers.
+ for (MCPhysReg Reg = RISCV::T0; Reg <= RISCV::T15; Reg++)
+ markSuperRegs(Reserved, Reg);
+
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 6472334..47c24fc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -317,6 +317,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Custom);
}
+ if (Subtarget->hasFP16()) {
+ setOperationAction(ISD::FMA, MVT::v8f16, Legal);
+ }
+
+ if (Subtarget->hasRelaxedSIMD()) {
+ setOperationAction(ISD::FMULADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMULADD, MVT::v2f64, Legal);
+ }
+
// Partial MLA reductions.
for (auto Op : {ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA}) {
setPartialReduceMLAAction(Op, MVT::v4i32, MVT::v16i8, Legal);
@@ -1120,6 +1129,18 @@ WebAssemblyTargetLowering::getPreferredVectorAction(MVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+bool WebAssemblyTargetLowering::isFMAFasterThanFMulAndFAdd(
+ const MachineFunction &MF, EVT VT) const {
+ if (!Subtarget->hasFP16() || !VT.isVector())
+ return false;
+
+ EVT ScalarVT = VT.getScalarType();
+ if (!ScalarVT.isSimple())
+ return false;
+
+ return ScalarVT.getSimpleVT().SimpleTy == MVT::f16;
+}
+
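The hook above feeds generic DAG combining, which forms FMA from fmul+fadd only when the target claims it is profitable. A simplified, illustrative sketch of the consumer-side check (DAGCombiner's exact guards differ):

static bool shouldFormFMA(const TargetLowering &TLI, const MachineFunction &MF,
                          EVT VT, SDNodeFlags Flags) {
  // Contraction must be permitted and the target must report a native FMA
  // as faster than separate fmul + fadd.
  return Flags.hasAllowContract() && TLI.isFMAFasterThanFMulAndFAdd(MF, VT);
}
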
bool WebAssemblyTargetLowering::shouldSimplifyDemandedVectorElts(
SDValue Op, const TargetLoweringOpt &TLO) const {
// ISel process runs DAGCombiner after legalization; this step is called
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index b33a853..472ec67 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -81,6 +81,8 @@ private:
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(MVT VT) const override;
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 49af78b..0f6e1ca 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1213,6 +1213,27 @@ defm EXTMUL_LOW_U :
defm EXTMUL_HIGH_U :
SIMDExtBinary<I64x2, extmul_high_u, "extmul_high_i32x4_u", 0xdf>;
+// Pattern for i32x4.dot_i16x8_s
+def : Pat<
+ (v4i32 (add
+ (wasm_shuffle
+ (v4i32 (extmul_low_s v8i16:$lhs, v8i16:$rhs)),
+ (v4i32 (extmul_high_s v8i16:$lhs, v8i16:$rhs)),
+ (i32 0), (i32 1), (i32 2), (i32 3),
+ (i32 8), (i32 9), (i32 10), (i32 11),
+ (i32 16), (i32 17), (i32 18), (i32 19),
+ (i32 24), (i32 25), (i32 26), (i32 27)),
+ (wasm_shuffle
+ (v4i32 (extmul_low_s v8i16:$lhs, v8i16:$rhs)),
+ (v4i32 (extmul_high_s v8i16:$lhs, v8i16:$rhs)),
+ (i32 4), (i32 5), (i32 6), (i32 7),
+ (i32 12), (i32 13), (i32 14), (i32 15),
+ (i32 20), (i32 21), (i32 22), (i32 23),
+ (i32 28), (i32 29), (i32 30), (i32 31)))
+ ),
+ (v4i32 (DOT v8i16:$lhs, v8i16:$rhs))
+>;
+
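For reference, the pattern captures the lane semantics of i32x4.dot_i16x8_s: extmul_low_s/extmul_high_s yield the eight sign-extended 16x16->32 products, the first shuffle gathers the even product lanes, the second the odd lanes, and the add sums adjacent pairs:

  p_k   = sext(lhs_k) \cdot sext(rhs_k),   k = 0, \ldots, 7
  dot_i = p_{2i} + p_{2i+1},               i = 0, \ldots, 3
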
//===----------------------------------------------------------------------===//
// Floating-point unary arithmetic
//===----------------------------------------------------------------------===//
@@ -1626,7 +1647,8 @@ defm "" : RelaxedConvert<I32x4, F64x2, int_wasm_relaxed_trunc_unsigned_zero,
// Relaxed (Negative) Multiply-Add (madd/nmadd)
//===----------------------------------------------------------------------===//
-multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> reqs> {
+multiclass RELAXED_SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS,
+ list<Predicate> reqs> {
defm MADD_#vec :
SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
[(set (vec.vt V128:$dst), (int_wasm_relaxed_madd
@@ -1640,16 +1662,46 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate>
vec.prefix#".relaxed_nmadd\t$dst, $a, $b, $c",
vec.prefix#".relaxed_nmadd", simdopS, reqs>;
- def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
- (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+ def : Pat<(fadd_contract (fmul_contract (vec.vt V128:$a), (vec.vt V128:$b)), (vec.vt V128:$c)),
+ (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
+ def : Pat<(fmuladd (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)),
+ (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
- def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
- (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+ def : Pat<(fsub_contract (vec.vt V128:$c), (fmul_contract (vec.vt V128:$a), (vec.vt V128:$b))),
+ (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
+ def : Pat<(fmuladd (fneg (vec.vt V128:$a)), (vec.vt V128:$b), (vec.vt V128:$c)),
+ (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
}
-defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
-defm "" : SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>;
-defm "" : SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>;
+defm "" : RELAXED_SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
+defm "" : RELAXED_SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>;
+
+//===----------------------------------------------------------------------===//
+// FP16 (Negative) Multiply-Add (madd/nmadd)
+//===----------------------------------------------------------------------===//
+
+multiclass HALF_PRECISION_SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS,
+ list<Predicate> reqs> {
+ defm MADD_#vec :
+ SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+ [(set (vec.vt V128:$dst), (fma
+ (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".madd\t$dst, $a, $b, $c",
+ vec.prefix#".madd", simdopA, reqs>;
+ defm NMADD_#vec :
+ SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+ [(set (vec.vt V128:$dst), (fma
+ (fneg (vec.vt V128:$a)), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".nmadd\t$dst, $a, $b, $c",
+ vec.prefix#".nmadd", simdopS, reqs>;
+}
+defm "" : HALF_PRECISION_SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>;
+
+// TODO: Consider introducing separate intrinsics for these FP16 operations.
+def : Pat<(v8f16 (int_wasm_relaxed_madd (v8f16 V128:$a), (v8f16 V128:$b), (v8f16 V128:$c))),
+ (MADD_F16x8 V128:$a, V128:$b, V128:$c)>;
+def : Pat<(v8f16 (int_wasm_relaxed_nmadd (v8f16 V128:$a), (v8f16 V128:$b), (v8f16 V128:$c))),
+ (NMADD_F16x8 V128:$a, V128:$b, V128:$c)>;
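
A hedged IRBuilder-side sketch of what now reaches the f16x8 patterns above: with the FP16 feature enabled, a v8f16 llvm.fma like the one built here selects to f16x8.madd instead of being expanded (Builder, A, B, C are placeholders; A, B, C have type <8 x half>):

Value *FMA =
    Builder.CreateIntrinsic(Intrinsic::fma, {A->getType()}, {A, B, C});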
//===----------------------------------------------------------------------===//
// Laneselect
diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp
index acf8e4c..5ea63a9 100644
--- a/llvm/lib/TargetParser/RISCVTargetParser.cpp
+++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp
@@ -228,6 +228,10 @@ void printVType(unsigned VType, raw_ostream &OS) {
OS << ", mu";
}
+void printXSfmmVType(unsigned VType, raw_ostream &OS) {
+ OS << "e" << getSEW(VType) << ", w" << getXSfmmWiden(VType);
+}
+
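A small usage sketch; the enclosing RISCVVType namespace and the decoded values are assumptions for illustration:

static std::string formatXSfmmVType(unsigned VType) {
  std::string Buf;
  llvm::raw_string_ostream OS(Buf);
  llvm::RISCVVType::printXSfmmVType(VType, OS);
  return Buf; // e.g. "e32, w2" when SEW decodes to 32 and widen to 2.
}
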
unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul) {
unsigned LMul;
bool Fractional;
diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h
index 26ec4f3..e05fe28 100644
--- a/llvm/lib/Transforms/Coroutines/CoroCloner.h
+++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h
@@ -1,3 +1,4 @@
+//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -19,9 +20,7 @@
#include "llvm/Transforms/Coroutines/CoroInstr.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
-namespace llvm {
-
-namespace coro {
+namespace llvm::coro {
enum class CloneKind {
/// The shared resume function for a switch lowering.
@@ -149,8 +148,6 @@ public:
}
};
-} // end namespace coro
-
-} // end namespace llvm
+} // end namespace llvm::coro
#endif // LLVM_LIB_TRANSFORMS_COROUTINES_COROCLONER_H
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index 471b9eb..cdb5852 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -38,7 +38,7 @@ public:
AnyResumeFnPtrTy(PointerType::getUnqual(Context)) {}
void lowerEarlyIntrinsics(Function &F);
};
-}
+} // namespace
// Replace a direct call to coro.resume or coro.destroy with an indirect call to
// an address returned by the coro.subfn.addr intrinsic. This is done so that
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 52f4ffe..cc47a55 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -16,11 +16,7 @@
#include "llvm/Transforms/Coroutines/CoroInstr.h"
#include "llvm/Transforms/Coroutines/CoroShape.h"
-namespace llvm {
-
-class CallGraph;
-
-namespace coro {
+namespace llvm::coro {
bool isSuspendBlock(BasicBlock *BB);
bool declaresAnyIntrinsic(const Module &M);
@@ -61,7 +57,6 @@ void normalizeCoroutine(Function &F, coro::Shape &Shape,
CallInst *createMustTailCall(DebugLoc Loc, Function *MustTailCallFn,
TargetTransformInfo &TTI,
ArrayRef<Value *> Arguments, IRBuilder<> &);
-} // End namespace coro.
-} // End namespace llvm
+} // End namespace llvm::coro
#endif
diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
index 6aaabca..f2444da 100644
--- a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
@@ -137,8 +137,7 @@ struct RematGraph {
} // namespace
-namespace llvm {
-template <> struct GraphTraits<RematGraph *> {
+template <> struct llvm::GraphTraits<RematGraph *> {
using NodeRef = RematGraph::RematNode *;
using ChildIteratorType = RematGraph::RematNode **;
@@ -149,8 +148,6 @@ template <> struct GraphTraits<RematGraph *> {
static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
};
-} // end namespace llvm
-
// For each instruction identified as materializable across the suspend point,
// and its associated DAG of other rematerializable instructions,
// recreate the DAG of instructions after the suspend point.
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
index e474c07..81fe0c9 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
@@ -16,11 +16,8 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-namespace llvm {
-
-namespace coro {
-
-namespace {
+using namespace llvm;
+using namespace llvm::coro;
typedef SmallPtrSet<BasicBlock *, 8> VisitedBlocksSet;
@@ -71,7 +68,7 @@ static bool isLocalAlloca(CoroAllocaAllocInst *AI) {
/// This happens during the all-instructions iteration, so it must not
/// delete the call.
static Instruction *
-lowerNonLocalAlloca(CoroAllocaAllocInst *AI, const coro::Shape &Shape,
+lowerNonLocalAlloca(CoroAllocaAllocInst *AI, const Shape &Shape,
SmallVectorImpl<Instruction *> &DeadInsts) {
IRBuilder<> Builder(AI);
auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr);
@@ -450,10 +447,8 @@ static void collectFrameAlloca(AllocaInst *AI, const coro::Shape &Shape,
Visitor.getMayWriteBeforeCoroBegin());
}
-} // namespace
-
-void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
- const SuspendCrossingInfo &Checker) {
+void coro::collectSpillsFromArgs(SpillInfo &Spills, Function &F,
+ const SuspendCrossingInfo &Checker) {
// Collect the spills for arguments and other not-materializable values.
for (Argument &A : F.args())
for (User *U : A.users())
@@ -461,7 +456,7 @@ void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
Spills[&A].push_back(cast<Instruction>(U));
}
-void collectSpillsAndAllocasFromInsts(
+void coro::collectSpillsAndAllocasFromInsts(
SpillInfo &Spills, SmallVector<AllocaInfo, 8> &Allocas,
SmallVector<Instruction *, 4> &DeadInstructions,
SmallVector<CoroAllocaAllocInst *, 4> &LocalAllocas, Function &F,
@@ -516,8 +511,8 @@ void collectSpillsAndAllocasFromInsts(
}
}
-void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
- const SuspendCrossingInfo &Checker) {
+void coro::collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
+ const SuspendCrossingInfo &Checker) {
  // We don't want the layout of the coroutine frame to be affected
  // by debug information. So we only choose to salvage dbg.values
  // whose value is already in the frame.
@@ -535,10 +530,9 @@ void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
/// Async and Retcon{Once} conventions assume that all spill uses can be sunk
/// after the coro.begin intrinsic.
-void sinkSpillUsesAfterCoroBegin(const DominatorTree &Dom,
- CoroBeginInst *CoroBegin,
- coro::SpillInfo &Spills,
- SmallVectorImpl<coro::AllocaInfo> &Allocas) {
+void coro::sinkSpillUsesAfterCoroBegin(
+ const DominatorTree &Dom, CoroBeginInst *CoroBegin, coro::SpillInfo &Spills,
+ SmallVectorImpl<coro::AllocaInfo> &Allocas) {
SmallSetVector<Instruction *, 32> ToMove;
SmallVector<Instruction *, 32> Worklist;
@@ -582,8 +576,9 @@ void sinkSpillUsesAfterCoroBegin(const DominatorTree &Dom,
Inst->moveBefore(InsertPt->getIterator());
}
-BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def,
- const DominatorTree &DT) {
+BasicBlock::iterator coro::getSpillInsertionPt(const coro::Shape &Shape,
+ Value *Def,
+ const DominatorTree &DT) {
BasicBlock::iterator InsertPt;
if (auto *Arg = dyn_cast<Argument>(Def)) {
// For arguments, we will place the store instruction right after
@@ -625,7 +620,3 @@ BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def,
return InsertPt;
}
-
-} // End namespace coro.
-
-} // End namespace llvm.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 7071876..943c223 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -471,7 +471,6 @@ private:
Value *simplifyNonNullOperand(Value *V, bool HasDereferenceable,
unsigned Depth = 0);
-public:
/// Create `select C, S1, S2`. Use only when the profile cannot be calculated
/// from existing profile metadata: if the Function has profiles, this will
/// set the profile of this select to "unknown".
@@ -484,6 +483,7 @@ public:
return Sel;
}
+public:
/// Create and insert the idiom we use to indicate a block is unreachable
/// without having to rewrite the CFG from within InstCombine.
void CreateNonTerminatorUnreachable(Instruction *InsertAt) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 63e24a0..a330bb7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -110,8 +110,8 @@ static Value *simplifyShiftSelectingPackedElement(Instruction *I,
ShrAmt->getName() + ".z");
// There is no existing !prof metadata we can derive the !prof metadata for
// this select.
- Value *Select = IC.createSelectInstWithUnknownProfile(ShrAmtZ, Lower, Upper);
- IC.Builder.Insert(Select);
+ Value *Select = IC.Builder.CreateSelectWithUnknownProfile(ShrAmtZ, Lower,
+ Upper, DEBUG_TYPE);
Select->takeName(I);
return Select;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 82ac903..3f11cae 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1690,6 +1690,11 @@ Instruction *InstCombinerImpl::foldFBinOpOfIntCastsFromSign(
// 2) (fp_binop ({s|u}itofp x), FpC)
// -> ({s|u}itofp (int_binop x, (fpto{s|u}i FpC)))
Instruction *InstCombinerImpl::foldFBinOpOfIntCasts(BinaryOperator &BO) {
+ // Don't perform the fold on vectors, as the integer operation may be much
+ // more expensive than the float operation in that case.
+ if (BO.getType()->isVectorTy())
+ return nullptr;
+
std::array<Value *, 2> IntOps = {nullptr, nullptr};
Constant *Op1FpC = nullptr;
// Check for:
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index c86092b..a6ec6c1 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/StaticDataProfileInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
@@ -194,6 +195,30 @@ static bool isAllocationWithHotColdVariant(const Function *Callee,
}
}
+static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar,
+ AnnotationKind Kind) {
+ assert(Kind != llvm::memprof::AnnotationKind::AnnotationOK &&
+ "Should not handle AnnotationOK here");
+ SmallString<32> Reason;
+ switch (Kind) {
+ case llvm::memprof::AnnotationKind::ExplicitSection:
+ ++NumOfMemProfExplicitSectionGlobalVars;
+ Reason.append("explicit section name");
+ break;
+ case llvm::memprof::AnnotationKind::DeclForLinker:
+ Reason.append("linker declaration");
+ break;
+ case llvm::memprof::AnnotationKind::ReservedName:
+ Reason.append("name starts with `llvm.`");
+ break;
+ default:
+ llvm_unreachable("Unexpected annotation kind");
+ }
+ LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to "
+ << Reason << ".\n");
+}
+
struct AllocMatchInfo {
uint64_t TotalSize = 0;
AllocationType AllocType = AllocationType::None;
@@ -775,29 +800,13 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
return PreservedAnalyses::none();
}
-// Returns true iff the global variable has custom section either by
-// __attribute__((section("name")))
-// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate)
-// or #pragma clang section directives
-// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section).
-static bool hasExplicitSectionName(const GlobalVariable &GVar) {
- if (GVar.hasSection())
- return true;
-
- auto Attrs = GVar.getAttributes();
- if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") ||
- Attrs.hasAttribute("relro-section") ||
- Attrs.hasAttribute("rodata-section"))
- return true;
- return false;
-}
-
bool MemProfUsePass::annotateGlobalVariables(
Module &M, const memprof::DataAccessProfData *DataAccessProf) {
if (!AnnotateStaticDataSectionPrefix || M.globals().empty())
return false;
if (!DataAccessProf) {
+ M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 0U);
M.getContext().diagnose(DiagnosticInfoPGOProfile(
MemoryProfileFileName.data(),
StringRef("Data access profiles not found in memprof. Ignore "
@@ -805,6 +814,7 @@ bool MemProfUsePass::annotateGlobalVariables(
DS_Warning));
return false;
}
+ M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 1U);
bool Changed = false;
// Iterate all global variables in the module and annotate them based on
@@ -815,13 +825,9 @@ bool MemProfUsePass::annotateGlobalVariables(
for (GlobalVariable &GVar : M.globals()) {
assert(!GVar.getSectionPrefix().has_value() &&
"GVar shouldn't have section prefix yet");
- if (GVar.isDeclarationForLinker())
- continue;
-
- if (hasExplicitSectionName(GVar)) {
- ++NumOfMemProfExplicitSectionGlobalVars;
- LLVM_DEBUG(dbgs() << "Global variable " << GVar.getName()
- << " has explicit section name. Skip annotating.\n");
+ auto Kind = llvm::memprof::getAnnotationKind(GVar);
+ if (Kind != llvm::memprof::AnnotationKind::AnnotationOK) {
+ HandleUnsupportedAnnotationKinds(GVar, Kind);
continue;
}
@@ -831,7 +837,6 @@ bool MemProfUsePass::annotateGlobalVariables(
// TODO: Track string content hash in the profiles and compute it inside the
    // compiler to categorize string literals by hotness.
if (Name.starts_with(".str")) {
-
LLVM_DEBUG(dbgs() << "Skip annotating string literal " << Name << "\n");
continue;
}
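
Downstream consumers can key off the EnableDataAccessProf module flag recorded above; a sketch of such a query (the consumer itself is hypothetical):

static bool hasDataAccessProf(const llvm::Module &M) {
  // 1 when data access profiles were applied, 0 when requested but missing.
  auto *CI = llvm::mdconst::dyn_extract_or_null<llvm::ConstantInt>(
      M.getModuleFlag("EnableDataAccessProf"));
  return CI && CI->isOne();
}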
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 56e0569..7cae94eb 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1295,6 +1295,24 @@ public:
return commonAlignment(InitialAlign, ElementSizeInBits / 8);
}
+ IntegerType *getIndexType(Value *Ptr) const {
+ return cast<IntegerType>(DL.getIndexType(Ptr->getType()));
+ }
+
+ Value *getIndex(Value *Ptr, uint64_t V) const {
+ return ConstantInt::get(getIndexType(Ptr), V);
+ }
+
+ Value *castToIndexType(Value *Ptr, Value *V, IRBuilder<> &Builder) const {
+ assert(isa<IntegerType>(V->getType()) &&
+ "Attempted to cast non-integral type to integer index");
+ // In case the data layout's index type differs in width from the type of
+ // the value we're given, truncate or zero extend to the appropriate width.
+ // We zero extend here as indices are unsigned.
+ return Builder.CreateZExtOrTrunc(V, getIndexType(Ptr),
+ V->getName() + ".cast");
+ }
+
/// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between
/// vectors.
MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
@@ -1304,6 +1322,7 @@ public:
Type *VecTy = FixedVectorType::get(EltTy, Shape.getStride());
Value *EltPtr = Ptr;
MatrixTy Result;
+ Stride = castToIndexType(Ptr, Stride, Builder);
for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
Value *GEP = computeVectorAddr(
EltPtr, Builder.getIntN(Stride->getType()->getScalarSizeInBits(), I),
@@ -1325,14 +1344,14 @@ public:
ShapeInfo ResultShape, Type *EltTy,
IRBuilder<> &Builder) {
Value *Offset = Builder.CreateAdd(
- Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
+ Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride())), I);
Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);
auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows *
ResultShape.NumColumns);
return loadMatrix(TileTy, TileStart, Align,
- Builder.getInt64(MatrixShape.getStride()), IsVolatile,
+ getIndex(MatrixPtr, MatrixShape.getStride()), IsVolatile,
ResultShape, Builder);
}
@@ -1363,14 +1382,15 @@ public:
MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,
Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {
Value *Offset = Builder.CreateAdd(
- Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
+ Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride())), I);
Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);
auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *
StoreVal.getNumColumns());
storeMatrix(TileTy, StoreVal, TileStart, MAlign,
- Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder);
+ getIndex(MatrixPtr, MatrixShape.getStride()), IsVolatile,
+ Builder);
}
/// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between
@@ -1380,6 +1400,7 @@ public:
IRBuilder<> &Builder) {
auto *VType = cast<FixedVectorType>(Ty);
Value *EltPtr = Ptr;
+ Stride = castToIndexType(Ptr, Stride, Builder);
for (auto Vec : enumerate(StoreVal.vectors())) {
Value *GEP = computeVectorAddr(
EltPtr,
@@ -2011,18 +2032,17 @@ public:
const unsigned TileM = std::min(M - K, unsigned(TileSize));
MatrixTy A =
loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(),
- LShape, Builder.getInt64(I), Builder.getInt64(K),
+ LShape, getIndex(APtr, I), getIndex(APtr, K),
{TileR, TileM}, EltType, Builder);
MatrixTy B =
loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
- RShape, Builder.getInt64(K), Builder.getInt64(J),
+ RShape, getIndex(BPtr, K), getIndex(BPtr, J),
{TileM, TileC}, EltType, Builder);
emitMatrixMultiply(Res, A, B, Builder, true, false,
getFastMathFlags(MatMul));
}
storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M},
- Builder.getInt64(I), Builder.getInt64(J), EltType,
- Builder);
+ getIndex(CPtr, I), getIndex(CPtr, J), EltType, Builder);
}
}
@@ -2254,15 +2274,14 @@ public:
/// Lower load instructions.
MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr,
IRBuilder<> &Builder) {
- return LowerLoad(Inst, Ptr, Inst->getAlign(),
- Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI,
- Builder);
+ return LowerLoad(Inst, Ptr, Inst->getAlign(), getIndex(Ptr, SI.getStride()),
+ Inst->isVolatile(), SI, Builder);
}
MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal,
Value *Ptr, IRBuilder<> &Builder) {
return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
- Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI,
+ getIndex(Ptr, SI.getStride()), Inst->isVolatile(), SI,
Builder);
}
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index b187208..3ce569f 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -44,7 +44,7 @@ using namespace llvm;
STATISTIC(RemappedAtomMax, "Highest global NextAtomGroup (after mapping)");
void llvm::mapAtomInstance(const DebugLoc &DL, ValueToValueMapTy &VMap) {
- auto CurGroup = DL->getAtomGroup();
+ uint64_t CurGroup = DL->getAtomGroup();
if (!CurGroup)
return;
@@ -62,21 +62,20 @@ void llvm::mapAtomInstance(const DebugLoc &DL, ValueToValueMapTy &VMap) {
RemappedAtomMax = std::max<uint64_t>(NewGroup, RemappedAtomMax);
}
-namespace {
-void collectDebugInfoFromInstructions(const Function &F,
- DebugInfoFinder &DIFinder) {
+static void collectDebugInfoFromInstructions(const Function &F,
+ DebugInfoFinder &DIFinder) {
const Module *M = F.getParent();
- if (M) {
- // Inspect instructions to process e.g. DILexicalBlocks of inlined functions
- for (const auto &I : instructions(F))
- DIFinder.processInstruction(*M, I);
- }
+ if (!M)
+ return;
+ // Inspect instructions to process e.g. DILexicalBlocks of inlined functions
+ for (const Instruction &I : instructions(F))
+ DIFinder.processInstruction(*M, I);
}
// Create a predicate that matches the metadata that should be identity mapped
// during function cloning.
-MetadataPredicate createIdentityMDPredicate(const Function &F,
- CloneFunctionChangeType Changes) {
+static MetadataPredicate
+createIdentityMDPredicate(const Function &F, CloneFunctionChangeType Changes) {
if (Changes >= CloneFunctionChangeType::DifferentModule)
return [](const Metadata *MD) { return false; };
@@ -107,7 +106,6 @@ MetadataPredicate createIdentityMDPredicate(const Function &F,
return false;
};
}
-} // namespace
/// See comments in Cloning.h.
BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
@@ -213,10 +211,9 @@ void llvm::CloneFunctionMetadataInto(Function &NewFunc, const Function &OldFunc,
const MetadataPredicate *IdentityMD) {
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
OldFunc.getAllMetadata(MDs);
- for (auto MD : MDs) {
- NewFunc.addMetadata(MD.first,
- *MapMetadata(MD.second, VMap, RemapFlag, TypeMapper,
- Materializer, IdentityMD));
+ for (const auto &[Kind, MD] : MDs) {
+ NewFunc.addMetadata(Kind, *MapMetadata(MD, VMap, RemapFlag, TypeMapper,
+ Materializer, IdentityMD));
}
}
@@ -235,7 +232,6 @@ void llvm::CloneFunctionBodyInto(Function &NewFunc, const Function &OldFunc,
// appropriate. Note that we save BE this way in order to handle cloning of
// recursive functions into themselves.
for (const BasicBlock &BB : OldFunc) {
-
// Create a new basic block and copy instructions into it!
BasicBlock *CBB =
CloneBasicBlock(&BB, VMap, NameSuffix, &NewFunc, CodeInfo);
@@ -321,7 +317,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
// Cloning is always a Module level operation, since Metadata needs to be
// cloned.
- const auto RemapFlag = RF_None;
+ const RemapFlags RemapFlag = RF_None;
CloneFunctionMetadataInto(*NewFunc, *OldFunc, VMap, RemapFlag, TypeMapper,
Materializer, &IdentityMD);
@@ -346,8 +342,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
// visiting the metadata attached to global values, which would allow this
// code to be deleted. Alternatively, perhaps give responsibility for this
// update to CloneFunctionInto's callers.
- auto *NewModule = NewFunc->getParent();
- auto *NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu");
+ Module *NewModule = NewFunc->getParent();
+ NamedMDNode *NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu");
// Avoid multiple insertions of the same DICompileUnit to NMD.
SmallPtrSet<const void *, 8> Visited(llvm::from_range, NMD->operands());
@@ -355,7 +351,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
// the function (e.g. as instructions' scope).
DebugInfoFinder DIFinder;
collectDebugInfoFromInstructions(*OldFunc, DIFinder);
- for (auto *Unit : DIFinder.compile_units()) {
+ for (DICompileUnit *Unit : DIFinder.compile_units()) {
MDNode *MappedUnit =
MapMetadata(Unit, VMap, RF_None, TypeMapper, Materializer);
if (Visited.insert(MappedUnit).second)
@@ -821,17 +817,16 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
--PredCount[Pred];
// Figure out how many entries to remove from each PHI.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- ++PredCount[PN->getIncomingBlock(i)];
+ for (BasicBlock *Pred : PN->blocks())
+ ++PredCount[Pred];
// At this point, the excess predecessor entries are positive in the
// map. Loop over all of the PHIs and remove excess predecessor
// entries.
BasicBlock::iterator I = NewBB->begin();
for (; (PN = dyn_cast<PHINode>(I)); ++I) {
- for (const auto &PCI : PredCount) {
- BasicBlock *Pred = PCI.first;
- for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove)
+ for (const auto &[Pred, Count] : PredCount) {
+ for (unsigned _ : llvm::seq<unsigned>(Count))
PN->removeIncomingValue(Pred, false);
}
}
@@ -866,8 +861,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// As phi-nodes have been now remapped, allow incremental simplification of
// newly-cloned instructions.
const DataLayout &DL = NewFunc->getDataLayout();
- for (const auto &BB : *OldFunc) {
- for (const auto &I : BB) {
+ for (const BasicBlock &BB : *OldFunc) {
+ for (const Instruction &I : BB) {
auto *NewI = dyn_cast_or_null<Instruction>(VMap.lookup(&I));
if (!NewI)
continue;
@@ -997,8 +992,8 @@ void llvm::CloneAndPruneFunctionInto(
void llvm::remapInstructionsInBlocks(ArrayRef<BasicBlock *> Blocks,
ValueToValueMapTy &VMap) {
// Rewrite the code to refer to itself.
- for (auto *BB : Blocks) {
- for (auto &Inst : *BB) {
+ for (BasicBlock *BB : Blocks) {
+ for (Instruction &Inst : *BB) {
RemapDbgRecordRange(Inst.getModule(), Inst.getDbgRecordRange(), VMap,
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
RemapInstruction(&Inst, VMap,
@@ -1151,9 +1146,9 @@ void llvm::cloneNoAliasScopes(ArrayRef<MDNode *> NoAliasDeclScopes,
StringRef Ext, LLVMContext &Context) {
MDBuilder MDB(Context);
- for (auto *ScopeList : NoAliasDeclScopes) {
- for (const auto &MDOperand : ScopeList->operands()) {
- if (MDNode *MD = dyn_cast<MDNode>(MDOperand)) {
+ for (MDNode *ScopeList : NoAliasDeclScopes) {
+ for (const MDOperand &MDOp : ScopeList->operands()) {
+ if (MDNode *MD = dyn_cast<MDNode>(MDOp)) {
AliasScopeNode SNANode(MD);
std::string Name;
@@ -1177,7 +1172,7 @@ void llvm::adaptNoAliasScopes(Instruction *I,
auto CloneScopeList = [&](const MDNode *ScopeList) -> MDNode * {
bool NeedsReplacement = false;
SmallVector<Metadata *, 8> NewScopeList;
- for (const auto &MDOp : ScopeList->operands()) {
+ for (const MDOperand &MDOp : ScopeList->operands()) {
if (MDNode *MD = dyn_cast<MDNode>(MDOp)) {
if (auto *NewMD = ClonedScopes.lookup(MD)) {
NewScopeList.push_back(NewMD);
@@ -1193,12 +1188,12 @@ void llvm::adaptNoAliasScopes(Instruction *I,
};
if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(I))
- if (auto *NewScopeList = CloneScopeList(Decl->getScopeList()))
+ if (MDNode *NewScopeList = CloneScopeList(Decl->getScopeList()))
Decl->setScopeList(NewScopeList);
auto replaceWhenNeeded = [&](unsigned MD_ID) {
if (const MDNode *CSNoAlias = I->getMetadata(MD_ID))
- if (auto *NewScopeList = CloneScopeList(CSNoAlias))
+ if (MDNode *NewScopeList = CloneScopeList(CSNoAlias))
I->setMetadata(MD_ID, NewScopeList);
};
replaceWhenNeeded(LLVMContext::MD_noalias);
diff --git a/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp b/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
index d7bf791..fb39fdd 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
@@ -11,11 +11,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/IteratedDominanceFrontier.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
@@ -112,7 +112,7 @@ struct BBValueInfo {
void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
SmallVectorImpl<PHINode *> *InsertedPHIs) {
DenseMap<BasicBlock *, BBValueInfo> BBInfos;
- for (auto &R : Rewrites) {
+ for (RewriteInfo &R : Rewrites) {
BBInfos.clear();
// Compute locations for new phi-nodes.
@@ -145,7 +145,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
BBInfos[BB].LiveOutValue = V;
// We've computed IDF, now insert new phi-nodes there.
- for (auto *FrontierBB : IDFBlocks) {
+ for (BasicBlock *FrontierBB : IDFBlocks) {
IRBuilder<> B(FrontierBB, FrontierBB->begin());
PHINode *PN = B.CreatePHI(R.Ty, 0, R.Name);
BBInfos[FrontierBB].LiveInValue = PN;
@@ -156,7 +156,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
// IsLiveOut indicates whether we are computing live-out values (true) or
// live-in values (false).
auto ComputeValue = [&](BasicBlock *BB, bool IsLiveOut) -> Value * {
- auto *BBInfo = &BBInfos[BB];
+ BBValueInfo *BBInfo = &BBInfos[BB];
if (IsLiveOut && BBInfo->LiveOutValue)
return BBInfo->LiveOutValue;
@@ -187,7 +187,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
if (!V)
V = UndefValue::get(R.Ty);
- for (auto *BBInfo : Stack)
+ for (BBValueInfo *BBInfo : Stack)
// Loop above can insert new entries into the BBInfos map: assume the
// map shouldn't grow due to [1] and BBInfo references are valid.
BBInfo->LiveInValue = V;
@@ -196,7 +196,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
};
// Fill in arguments of the inserted PHIs.
- for (auto *BB : IDFBlocks) {
+ for (BasicBlock *BB : IDFBlocks) {
auto *PHI = cast<PHINode>(&BB->front());
for (BasicBlock *Pred : PredCache.get(BB))
PHI->addIncoming(ComputeValue(Pred, /*IsLiveOut=*/true), Pred);
@@ -222,3 +222,96 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
}
}
}
+
+// Perform a single pass of simplification over the worklist of PHIs.
+// This should be called after RewriteAllUses() because simplifying PHIs
+// immediately after creation would require updating all references to those
+// PHIs in the BBValueInfo structures, which would necessitate additional
+// reference tracking overhead.
+static void simplifyPass(MutableArrayRef<PHINode *> Worklist,
+ const DataLayout &DL) {
+ for (PHINode *&PHI : Worklist) {
+ if (Value *Simplified = simplifyInstruction(PHI, DL)) {
+ PHI->replaceAllUsesWith(Simplified);
+ PHI->eraseFromParent();
+ PHI = nullptr; // Mark as removed.
+ }
+ }
+}
+
+#ifndef NDEBUG // Should this be under EXPENSIVE_CHECKS?
+// New PHI nodes should not reference one another but they may reference
+// themselves or existing PHI nodes, and existing PHI nodes may reference new
+// PHI nodes.
+static bool
+PHIAreRefEachOther(const iterator_range<BasicBlock::phi_iterator> NewPHIs) {
+ SmallPtrSet<PHINode *, 8> NewPHISet;
+ for (PHINode &PN : NewPHIs)
+ NewPHISet.insert(&PN);
+ for (PHINode &PHI : NewPHIs) {
+ for (Value *V : PHI.incoming_values()) {
+ PHINode *IncPHI = dyn_cast<PHINode>(V);
+ if (IncPHI && IncPHI != &PHI && NewPHISet.contains(IncPHI))
+ return true;
+ }
+ }
+ return false;
+}
+#endif
+
+static bool replaceIfIdentical(PHINode &PHI, PHINode &ReplPHI) {
+ if (!PHI.isIdenticalToWhenDefined(&ReplPHI))
+ return false;
+ PHI.replaceAllUsesWith(&ReplPHI);
+ PHI.eraseFromParent();
+ return true;
+}
+
+bool EliminateNewDuplicatePHINodes(BasicBlock *BB,
+ BasicBlock::phi_iterator FirstExistingPN) {
+ assert(!PHIAreRefEachOther(make_range(BB->phis().begin(), FirstExistingPN)));
+
+ // Deduplicate new PHIs first to reduce the number of comparisons on the
+ // following new -> existing pass.
+ bool Changed = false;
+ for (auto I = BB->phis().begin(); I != FirstExistingPN; ++I) {
+ for (auto J = std::next(I); J != FirstExistingPN;) {
+ Changed |= replaceIfIdentical(*J++, *I);
+ }
+ }
+
+ // Iterate over existing PHIs and replace identical new PHIs.
+ for (PHINode &ExistingPHI : make_range(FirstExistingPN, BB->phis().end())) {
+ auto I = BB->phis().begin();
+ assert(I != FirstExistingPN); // Should be at least one new PHI.
+ do {
+ Changed |= replaceIfIdentical(*I++, ExistingPHI);
+ } while (I != FirstExistingPN);
+ if (BB->phis().begin() == FirstExistingPN)
+ return Changed;
+ }
+ return Changed;
+}
+
+static void deduplicatePass(ArrayRef<PHINode *> Worklist) {
+ SmallDenseMap<BasicBlock *, unsigned> BBs;
+ for (PHINode *PHI : Worklist) {
+ if (PHI)
+ ++BBs[PHI->getParent()];
+ }
+
+ for (auto [BB, NumNewPHIs] : BBs) {
+ auto FirstExistingPN = std::next(BB->phis().begin(), NumNewPHIs);
+ EliminateNewDuplicatePHINodes(BB, FirstExistingPN);
+ }
+}
+
+void SSAUpdaterBulk::RewriteAndOptimizeAllUses(DominatorTree &DT) {
+ SmallVector<PHINode *, 4> PHIs;
+ RewriteAllUses(&DT, &PHIs);
+ if (PHIs.empty())
+ return;
+
+ simplifyPass(PHIs, PHIs.front()->getParent()->getDataLayout());
+ deduplicatePass(PHIs);
+}
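
Putting the pieces together, a hypothetical caller of the new entry point (the setup values are placeholders):

SSAUpdaterBulk Updater;
unsigned Var = Updater.AddVariable("x", ValTy);
Updater.AddAvailableValue(Var, DefBB, DefVal);
Updater.AddUse(Var, &U);
// RewriteAllUses, then one simplification pass and one PHI-deduplication
// pass over the newly inserted PHIs.
Updater.RewriteAndOptimizeAllUses(DT);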
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a6f4bec..88af2cf 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10659,7 +10659,8 @@ class InstructionsCompatibilityAnalysis {
static bool isSupportedOpcode(const unsigned Opcode) {
return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
- Opcode == Instruction::UDiv;
+ Opcode == Instruction::UDiv || Opcode == Instruction::And ||
+ Opcode == Instruction::Or || Opcode == Instruction::Xor;
}
/// Identifies the best candidate value, which represents main opcode
@@ -10984,6 +10985,9 @@ public:
case Instruction::Shl:
case Instruction::SDiv:
case Instruction::UDiv:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
break;
default:
@@ -19456,7 +19460,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
assert(getNumElements(Cond->getType()) == TrueNumElements &&
"Cannot vectorize Instruction::Select");
- Value *V = Builder.CreateSelect(Cond, True, False);
+ Value *V =
+ Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
V = FinalShuffle(V, E);
E->VectorizedValue = V;
@@ -23576,18 +23581,19 @@ class HorizontalReduction {
switch (Kind) {
case RecurKind::Or: {
if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
- return Builder.CreateSelect(
+ return Builder.CreateSelectWithUnknownProfile(
LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
- RHS, Name);
+ RHS, DEBUG_TYPE, Name);
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
}
case RecurKind::And: {
if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
- return Builder.CreateSelect(
+ return Builder.CreateSelectWithUnknownProfile(
LHS, RHS,
- ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
+ ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
+ DEBUG_TYPE, Name);
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
@@ -23608,7 +23614,8 @@ class HorizontalReduction {
if (UseSelect) {
CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
+ Name);
}
[[fallthrough]];
case RecurKind::FMax:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fb696be..8ca3bed 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1064,6 +1064,7 @@ public:
ResumeForEpilogue,
/// Returns the value for vscale.
VScale,
+ OpsEnd = VScale,
};
/// Returns true if this VPInstruction generates scalar values for all lanes.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 3e65d42..c0147ce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -479,8 +479,7 @@ static void createExtractsForLiveOuts(VPlan &Plan, VPBasicBlock *MiddleVPBB) {
static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL,
PredicatedScalarEvolution &PSE, Loop *TheLoop) {
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
auto *HeaderVPBB = cast<VPBasicBlock>(Plan.getEntry()->getSingleSuccessor());
canonicalHeaderAndLatch(HeaderVPBB, VPDT);
@@ -622,8 +621,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan,
}
void VPlanTransforms::createLoopRegions(VPlan &Plan) {
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
for (VPBlockBase *HeaderVPB : vp_post_order_shallow(Plan.getEntry()))
if (canonicalHeaderAndLatch(HeaderVPB, VPDT))
createLoopRegion(Plan, HeaderVPB);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 577432f..44506f5a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -39,7 +39,6 @@ class VPDominatorTree : public DominatorTreeBase<VPBlockBase, false> {
using Base = DominatorTreeBase<VPBlockBase, false>;
public:
- VPDominatorTree() = default;
explicit VPDominatorTree(VPlan &Plan) { recalculate(Plan); }
/// Returns true if \p A properly dominates \p B.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index af755ca..40b7e8d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1909,8 +1909,7 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
VPBuilder &LoopBuilder) {
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
for (VPRecipeBase &R :
@@ -1992,6 +1991,13 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
.Case<VPWidenIntrinsicRecipe>([](auto *I) {
return std::make_pair(true, I->getVectorIntrinsicID());
})
+ .Case<VPVectorPointerRecipe>([](auto *I) {
+ // For recipes that do not directly map to LLVM IR instructions,
+ // assign opcodes after the last VPInstruction opcode (which is also
+ // after the last IR Instruction opcode), based on the VPDefID.
+ return std::make_pair(false,
+ VPInstruction::OpsEnd + 1 + I->getVPDefID());
+ })
.Default([](auto *) { return std::nullopt; });
}
@@ -2015,11 +2021,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
static bool canHandle(const VPSingleDefRecipe *Def) {
// We can extend the list of handled recipes in the future,
// provided we account for the data embedded in them while checking for
- // equality or hashing. We assign VPVectorEndPointerRecipe the GEP opcode,
- // as it is essentially a GEP with different semantics.
- auto C = isa<VPVectorPointerRecipe>(Def)
- ? std::make_pair(false, Instruction::GetElementPtr)
- : getOpcodeOrIntrinsicID(Def);
+ // equality or hashing.
+ auto C = getOpcodeOrIntrinsicID(Def);
// The issue with (Insert|Extract)Value is that the index of the
// insert/extract is not a proper operand in LLVM IR, and hence also not in
@@ -2058,6 +2061,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
vputils::isSingleScalar(L) != vputils::isSingleScalar(R) ||
!equal(L->operands(), R->operands()))
return false;
+ assert(getOpcodeOrIntrinsicID(L) && getOpcodeOrIntrinsicID(R) &&
+ "must have valid opcode info for both recipes");
if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
if (LFlags->hasPredicate() &&
LFlags->getPredicate() !=
@@ -3021,8 +3026,7 @@ void VPlanTransforms::createInterleaveGroups(
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
for (const auto *IG : InterleaveGroups) {
auto *Start =
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
@@ -3661,8 +3665,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
return;
#ifndef NDEBUG
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
#endif
SmallVector<VPValue *> VPValues;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 752e03d..5262af6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -489,8 +489,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
}
bool llvm::verifyVPlanIsValid(const VPlan &Plan, bool VerifyLate) {
- VPDominatorTree VPDT;
- VPDT.recalculate(const_cast<VPlan &>(Plan));
+ VPDominatorTree VPDT(const_cast<VPlan &>(Plan));
VPTypeAnalysis TypeInfo(Plan);
VPlanVerifier Verifier(VPDT, TypeInfo, VerifyLate);
return Verifier.verify(Plan);
diff --git a/llvm/lib/XRay/BlockIndexer.cpp b/llvm/lib/XRay/BlockIndexer.cpp
index f4ba0eb..d0c6853 100644
--- a/llvm/lib/XRay/BlockIndexer.cpp
+++ b/llvm/lib/XRay/BlockIndexer.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/BlockIndexer.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error BlockIndexer::visit(BufferExtents &) { return Error::success(); }
@@ -89,6 +89,3 @@ Error BlockIndexer::flush() {
CurrentBlock.WallclockTime = nullptr;
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/BlockPrinter.cpp b/llvm/lib/XRay/BlockPrinter.cpp
index 63a60c3..d85be5b 100644
--- a/llvm/lib/XRay/BlockPrinter.cpp
+++ b/llvm/lib/XRay/BlockPrinter.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/BlockPrinter.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error BlockPrinter::visit(BufferExtents &R) {
OS << "\n[New Block]\n";
@@ -108,6 +108,3 @@ Error BlockPrinter::visit(EndBufferRecord &R) {
auto E = RP.visit(R);
return E;
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/BlockVerifier.cpp b/llvm/lib/XRay/BlockVerifier.cpp
index 99f255e..e39f6b6 100644
--- a/llvm/lib/XRay/BlockVerifier.cpp
+++ b/llvm/lib/XRay/BlockVerifier.cpp
@@ -10,19 +10,18 @@
#include <bitset>
-namespace llvm {
-namespace xray {
-namespace {
+using namespace llvm;
+using namespace llvm::xray;
-constexpr unsigned long long mask(BlockVerifier::State S) {
+static constexpr unsigned long long mask(BlockVerifier::State S) {
return 1uLL << static_cast<std::size_t>(S);
}
-constexpr std::size_t number(BlockVerifier::State S) {
+static constexpr std::size_t number(BlockVerifier::State S) {
return static_cast<std::size_t>(S);
}
-StringRef recordToString(BlockVerifier::State R) {
+static StringRef recordToString(BlockVerifier::State R) {
switch (R) {
case BlockVerifier::State::BufferExtents:
return "BufferExtents";
@@ -53,6 +52,8 @@ StringRef recordToString(BlockVerifier::State R) {
llvm_unreachable("Unkown state!");
}
+namespace {
+
struct Transition {
BlockVerifier::State From;
std::bitset<number(BlockVerifier::State::StateMax)> ToStates;
@@ -133,7 +134,7 @@ Error BlockVerifier::transition(State To) {
CurrentRecord = To;
return Error::success();
-} // namespace xray
+}
Error BlockVerifier::visit(BufferExtents &) {
return transition(State::BufferExtents);
@@ -201,6 +202,3 @@ Error BlockVerifier::verify() {
}
void BlockVerifier::reset() { CurrentRecord = State::Unknown; }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRRecordProducer.cpp b/llvm/lib/XRay/FDRRecordProducer.cpp
index 479b710..0f4eed1 100644
--- a/llvm/lib/XRay/FDRRecordProducer.cpp
+++ b/llvm/lib/XRay/FDRRecordProducer.cpp
@@ -10,8 +10,8 @@
#include <cstdint>
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
namespace {
@@ -31,8 +31,9 @@ enum MetadataRecordKinds : uint8_t {
// This is an end marker, used to identify the upper bound for this enum.
EnumEndMarker,
};
+} // namespace
-Expected<std::unique_ptr<Record>>
+static Expected<std::unique_ptr<Record>>
metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
if (T >= static_cast<uint8_t>(MetadataRecordKinds::EnumEndMarker))
@@ -72,12 +73,10 @@ metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
llvm_unreachable("Unhandled MetadataRecordKinds enum value");
}
-constexpr bool isMetadataIntroducer(uint8_t FirstByte) {
+static constexpr bool isMetadataIntroducer(uint8_t FirstByte) {
return FirstByte & 0x01u;
}
-} // namespace
-
Expected<std::unique_ptr<Record>>
FileBasedRecordProducer::findNextBufferExtent() {
// We seek one byte at a time until we find a suitable buffer extents metadata
@@ -193,6 +192,3 @@ Expected<std::unique_ptr<Record>> FileBasedRecordProducer::produce() {
assert(R != nullptr);
return std::move(R);
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRRecords.cpp b/llvm/lib/XRay/FDRRecords.cpp
index ff315d3..a18f733 100644
--- a/llvm/lib/XRay/FDRRecords.cpp
+++ b/llvm/lib/XRay/FDRRecords.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error BufferExtents::apply(RecordVisitor &V) { return V.visit(*this); }
Error WallclockRecord::apply(RecordVisitor &V) { return V.visit(*this); }
@@ -61,6 +61,3 @@ StringRef Record::kindToString(RecordKind K) {
}
return "Unknown";
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRTraceExpander.cpp b/llvm/lib/XRay/FDRTraceExpander.cpp
index b68e997..991e6e5 100644
--- a/llvm/lib/XRay/FDRTraceExpander.cpp
+++ b/llvm/lib/XRay/FDRTraceExpander.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FDRTraceExpander.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
void TraceExpander::resetCurrentRecord() {
if (BuildingRecord)
@@ -126,6 +126,3 @@ Error TraceExpander::flush() {
resetCurrentRecord();
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRTraceWriter.cpp b/llvm/lib/XRay/FDRTraceWriter.cpp
index fb59125..3e320a6 100644
--- a/llvm/lib/XRay/FDRTraceWriter.cpp
+++ b/llvm/lib/XRay/FDRTraceWriter.cpp
@@ -12,8 +12,8 @@
#include "llvm/XRay/FDRTraceWriter.h"
#include <tuple>
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
namespace {
@@ -37,9 +37,10 @@ template <size_t Index> struct IndexedWriter {
return 0;
}
};
+} // namespace
template <uint8_t Kind, class... Values>
-Error writeMetadata(support::endian::Writer &OS, Values &&... Ds) {
+static Error writeMetadata(support::endian::Writer &OS, Values &&...Ds) {
// The first bit in the first byte of metadata records is always set to 1, so
// we ensure this is the case when we write out the first byte of the record.
uint8_t FirstByte = (static_cast<uint8_t>(Kind) << 1) | uint8_t{0x01u};
@@ -54,8 +55,6 @@ Error writeMetadata(support::endian::Writer &OS, Values &&... Ds) {
return Error::success();
}
-} // namespace
-
FDRTraceWriter::FDRTraceWriter(raw_ostream &O, const XRayFileHeader &H)
: OS(O, llvm::endianness::native) {
// We need to re-construct a header, by writing the fields we care about for
@@ -146,6 +145,3 @@ Error FDRTraceWriter::visit(FunctionRecord &R) {
OS.write(R.delta());
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FileHeaderReader.cpp b/llvm/lib/XRay/FileHeaderReader.cpp
index 6b6daf9..681cef7 100644
--- a/llvm/lib/XRay/FileHeaderReader.cpp
+++ b/llvm/lib/XRay/FileHeaderReader.cpp
@@ -7,12 +7,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FileHeaderReader.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
// Populates the FileHeader reference by reading the first 32 bytes of the file.
-Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
- uint64_t &OffsetPtr) {
+Expected<XRayFileHeader>
+xray::readBinaryFormatHeader(DataExtractor &HeaderExtractor,
+ uint64_t &OffsetPtr) {
// FIXME: Maybe deduce whether the data is little or big-endian using some
// magic bytes in the beginning of the file?
@@ -68,6 +69,3 @@ Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
OffsetPtr += 16;
return std::move(FileHeader);
}
-
-} // namespace xray
-} // namespace llvm
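
With the namespace block gone, the out-of-line definition above has to be spelled with an explicit xray:: qualifier; the using-directive alone would make an unqualified definition declare a brand-new function at global scope instead of defining the declared one. A minimal sketch of the pattern (hypothetical function name):

  namespace llvm {
  namespace xray {
  int parseHeaderSketch(); // hypothetical stand-in for the real declaration
  } // namespace xray
  } // namespace llvm

  using namespace llvm;

  // Correct: the qualified-id attaches this definition to
  // llvm::xray::parseHeaderSketch.
  int xray::parseHeaderSketch() { return 0; }

  // Unqualified `int parseHeaderSketch() { ... }` would instead define an
  // unrelated ::parseHeaderSketch and leave the declared symbol undefined.
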
diff --git a/llvm/lib/XRay/LogBuilderConsumer.cpp b/llvm/lib/XRay/LogBuilderConsumer.cpp
index ffb49f9..f0fc336 100644
--- a/llvm/lib/XRay/LogBuilderConsumer.cpp
+++ b/llvm/lib/XRay/LogBuilderConsumer.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FDRRecordConsumer.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error LogBuilderConsumer::consume(std::unique_ptr<Record> R) {
if (!R)
@@ -32,6 +32,3 @@ Error PipelineConsumer::consume(std::unique_ptr<Record> R) {
Result = joinErrors(std::move(Result), R->apply(*V));
return Result;
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/Profile.cpp b/llvm/lib/XRay/Profile.cpp
index 1b340e5..ecb767b 100644
--- a/llvm/lib/XRay/Profile.cpp
+++ b/llvm/lib/XRay/Profile.cpp
@@ -18,8 +18,8 @@
#include "llvm/XRay/Trace.h"
#include <memory>
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Profile::Profile(const Profile &O) {
// We need to re-create all the tries from the original (O), into the current
@@ -46,6 +46,7 @@ struct BlockHeader {
uint32_t Number;
uint64_t Thread;
};
+} // namespace
static Expected<BlockHeader> readBlockHeader(DataExtractor &Extractor,
uint64_t &Offset) {
@@ -115,8 +116,6 @@ static Expected<Profile::Data> readData(DataExtractor &Extractor,
return D;
}
-} // namespace
-
Error Profile::addBlock(Block &&B) {
if (B.PathData.empty())
return make_error<StringError>(
@@ -189,7 +188,7 @@ Profile::PathID Profile::internPath(ArrayRef<FuncID> P) {
return Node->ID;
}
-Profile mergeProfilesByThread(const Profile &L, const Profile &R) {
+Profile xray::mergeProfilesByThread(const Profile &L, const Profile &R) {
Profile Merged;
using PathDataMap = DenseMap<Profile::PathID, Profile::Data>;
using PathDataMapPtr = std::unique_ptr<PathDataMap>;
@@ -228,7 +227,7 @@ Profile mergeProfilesByThread(const Profile &L, const Profile &R) {
return Merged;
}
-Profile mergeProfilesByStack(const Profile &L, const Profile &R) {
+Profile xray::mergeProfilesByStack(const Profile &L, const Profile &R) {
Profile Merged;
using PathDataMap = DenseMap<Profile::PathID, Profile::Data>;
PathDataMap PathData;
@@ -258,7 +257,7 @@ Profile mergeProfilesByStack(const Profile &L, const Profile &R) {
return Merged;
}
-Expected<Profile> loadProfile(StringRef Filename) {
+Expected<Profile> xray::loadProfile(StringRef Filename) {
Expected<sys::fs::file_t> FdOrErr = sys::fs::openNativeFileForRead(Filename);
if (!FdOrErr)
return FdOrErr.takeError();
@@ -322,7 +321,7 @@ struct StackEntry {
} // namespace
-Expected<Profile> profileFromTrace(const Trace &T) {
+Expected<Profile> xray::profileFromTrace(const Trace &T) {
Profile P;
// The implementation of the algorithm re-creates the execution of
@@ -397,6 +396,3 @@ Expected<Profile> profileFromTrace(const Trace &T) {
return P;
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/RecordInitializer.cpp b/llvm/lib/XRay/RecordInitializer.cpp
index 68ab3db..83d5f14 100644
--- a/llvm/lib/XRay/RecordInitializer.cpp
+++ b/llvm/lib/XRay/RecordInitializer.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error RecordInitializer::visit(BufferExtents &R) {
if (!E.isValidOffsetForDataOfSize(OffsetPtr, sizeof(uint64_t)))
@@ -426,6 +426,3 @@ Error RecordInitializer::visit(FunctionRecord &R) {
assert(FunctionRecord::kFunctionRecordSize == (OffsetPtr - BeginOffset));
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/RecordPrinter.cpp b/llvm/lib/XRay/RecordPrinter.cpp
index 32d4210..b9b7a16 100644
--- a/llvm/lib/XRay/RecordPrinter.cpp
+++ b/llvm/lib/XRay/RecordPrinter.cpp
@@ -9,8 +9,8 @@
#include "llvm/Support/FormatVariadic.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error RecordPrinter::visit(BufferExtents &R) {
OS << formatv("<Buffer: size = {0} bytes>", R.size()) << Delim;
@@ -103,6 +103,3 @@ Error RecordPrinter::visit(FunctionRecord &R) {
OS << Delim;
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/Trace.cpp b/llvm/lib/XRay/Trace.cpp
index 74515b1..14a3f01 100644
--- a/llvm/lib/XRay/Trace.cpp
+++ b/llvm/lib/XRay/Trace.cpp
@@ -29,11 +29,9 @@ using namespace llvm;
using namespace llvm::xray;
using llvm::yaml::Input;
-namespace {
-
-Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
- XRayFileHeader &FileHeader,
- std::vector<XRayRecord> &Records) {
+static Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
+ XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
if (Data.size() < 32)
return make_error<StringError>(
"Not enough bytes for an XRay log.",
@@ -265,8 +263,9 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
/// what FunctionRecord instances use, and we no longer need to include the CPU
/// id in the CustomEventRecord.
///
-Error loadFDRLog(StringRef Data, bool IsLittleEndian,
- XRayFileHeader &FileHeader, std::vector<XRayRecord> &Records) {
+static Error loadFDRLog(StringRef Data, bool IsLittleEndian,
+ XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
if (Data.size() < 32)
return createStringError(std::make_error_code(std::errc::invalid_argument),
@@ -348,8 +347,8 @@ Error loadFDRLog(StringRef Data, bool IsLittleEndian,
return Error::success();
}
-Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
- std::vector<XRayRecord> &Records) {
+static Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
YAMLXRayTrace Trace;
Input In(Data);
In >> Trace;
@@ -376,7 +375,6 @@ Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
});
return Error::success();
}
-} // namespace
Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) {
Expected<sys::fs::file_t> FdOrErr = sys::fs::openNativeFileForRead(Filename);
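
Trace.cpp follows the same cleanup convention as the other files: the anonymous namespace shrinks away and file-local functions take internal linkage via static, keeping anonymous namespaces for type definitions as the LLVM coding standard recommends. A compact sketch of the convention (names are illustrative):

  namespace {
  struct LoaderState { // file-local types keep the anonymous namespace
    unsigned Offset = 0;
  };
  } // namespace

  // File-local functions use `static` instead of the anonymous namespace.
  static unsigned advance(LoaderState &S) { return ++S.Offset; }
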
diff --git a/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll b/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll
index e784d25..acac2c9 100644
--- a/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll
+++ b/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll
@@ -447,6 +447,84 @@ bb5:
ret void
}
+define void @pr46786_c26_char_cmp_ops_swapped(ptr %arg, ptr %arg1, ptr %arg2) {
+; X64-LABEL: 'pr46786_c26_char_cmp_ops_swapped'
+; X64-NEXT: Classifying expressions for: @pr46786_c26_char_cmp_ops_swapped
+; X64-NEXT: %i4 = ptrtoint ptr %arg to i64
+; X64-NEXT: --> (ptrtoint ptr %arg to i64) U: full-set S: full-set
+; X64-NEXT: %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ]
+; X64-NEXT: --> {%arg,+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: %i8 = load i8, ptr %i7, align 1
+; X64-NEXT: --> %i8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X64-NEXT: %i9 = ptrtoint ptr %i7 to i64
+; X64-NEXT: --> {(ptrtoint ptr %arg to i64),+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: %i10 = sub i64 %i9, %i4
+; X64-NEXT: --> {0,+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
+; X64-NEXT: --> {%arg2,+,1}<nw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg2) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: %i12 = load i8, ptr %i11, align 1
+; X64-NEXT: --> %i12 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X64-NEXT: %i13 = add i8 %i12, %i8
+; X64-NEXT: --> (%i12 + %i8) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X64-NEXT: %i14 = getelementptr inbounds i8, ptr %i7, i64 1
+; X64-NEXT: --> {(1 + %arg),+,1}<nuw><%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: Determining loop execution counts for: @pr46786_c26_char_cmp_ops_swapped
+; X64-NEXT: Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64))
+; X64-NEXT: Loop %bb6: constant max backedge-taken count is i64 -1
+; X64-NEXT: Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64))
+; X64-NEXT: Loop %bb6: Trip multiple is 1
+;
+; X32-LABEL: 'pr46786_c26_char_cmp_ops_swapped'
+; X32-NEXT: Classifying expressions for: @pr46786_c26_char_cmp_ops_swapped
+; X32-NEXT: %i4 = ptrtoint ptr %arg to i64
+; X32-NEXT: --> (zext i32 (ptrtoint ptr %arg to i32) to i64) U: [0,4294967296) S: [0,4294967296)
+; X32-NEXT: %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ]
+; X32-NEXT: --> {%arg,+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: %i8 = load i8, ptr %i7, align 1
+; X32-NEXT: --> %i8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X32-NEXT: %i9 = ptrtoint ptr %i7 to i64
+; X32-NEXT: --> {(zext i32 (ptrtoint ptr %arg to i32) to i64),+,1}<nuw><%bb6> U: [0,8589934591) S: [0,8589934591) Exits: ((zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) + (zext i32 (ptrtoint ptr %arg to i32) to i64)) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: %i10 = sub i64 %i9, %i4
+; X32-NEXT: --> {0,+,1}<nuw><%bb6> U: [0,4294967296) S: [0,4294967296) Exits: (zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
+; X32-NEXT: --> {%arg2,+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg2) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: %i12 = load i8, ptr %i11, align 1
+; X32-NEXT: --> %i12 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X32-NEXT: %i13 = add i8 %i12, %i8
+; X32-NEXT: --> (%i12 + %i8) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X32-NEXT: %i14 = getelementptr inbounds i8, ptr %i7, i64 1
+; X32-NEXT: --> {(1 + %arg),+,1}<nuw><%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: Determining loop execution counts for: @pr46786_c26_char_cmp_ops_swapped
+; X32-NEXT: Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32))
+; X32-NEXT: Loop %bb6: constant max backedge-taken count is i32 -1
+; X32-NEXT: Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32))
+; X32-NEXT: Loop %bb6: Trip multiple is 1
+;
+ %i = icmp eq ptr %arg1, %arg
+ br i1 %i, label %bb5, label %bb3
+
+bb3:
+ %i4 = ptrtoint ptr %arg to i64
+ br label %bb6
+
+bb6:
+ %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ]
+ %i8 = load i8, ptr %i7
+ %i9 = ptrtoint ptr %i7 to i64
+ %i10 = sub i64 %i9, %i4
+ %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
+ %i12 = load i8, ptr %i11
+ %i13 = add i8 %i12, %i8
+ store i8 %i13, ptr %i11
+ %i14 = getelementptr inbounds i8, ptr %i7, i64 1
+ %i15 = icmp eq ptr %i14, %arg1
+ br i1 %i15, label %bb5, label %bb6
+
+bb5:
+ ret void
+}
+
+
; void pr46786_c26_int(int* start, int *end, int *other) {
; for (int* cur = start; cur != end; ++cur)
; other[cur - start] += *cur;
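
The newly added function is the char counterpart of the loop sketched in the comment above, differing from the existing char test only in the operand order of the entry pointer-equality icmp (as the test name says). Its source shape, for reference (illustrative):

  // Source shape of @pr46786_c26_char_cmp_ops_swapped; the operand swap
  // happens at the IR level in the entry icmp, not in this source form.
  void pr46786_c26_char_cmp_ops_swapped(char *start, char *end, char *other) {
    for (char *cur = start; cur != end; ++cur)
      other[cur - start] += *cur;
  }
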
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
index 8552931..ee35447 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
@@ -102,8 +102,8 @@ body: |
; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
%0:_(<4 x s16>) = COPY $d0
- %2:_(s16) = COPY $h0
- %1:_(s16) = G_CONSTANT i16 3
+ %1:_(s16) = COPY $h0
+ %2:_(s16) = G_CONSTANT i16 3
%3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
%4:_(<4 x s16>) = G_ASHR %0, %3
...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir
index 61d1c43..97bcb80 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir
@@ -135,8 +135,8 @@ body: |
; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
%0:_(<4 x s16>) = COPY $d0
- %2:_(s16) = COPY $h0
- %1:_(s16) = G_CONSTANT i16 3
+ %1:_(s16) = COPY $h0
+ %2:_(s16) = G_CONSTANT i16 3
%3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
%4:_(<4 x s16>) = G_SHL %0, %3
...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir
new file mode 100644
index 0000000..332049d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir
@@ -0,0 +1,276 @@
+# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64 -passes="print<gisel-value-tracking>" -filetype=null %s 2>&1 | FileCheck %s
+
+---
+name: Cst
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @Cst
+ ; CHECK-NEXT: %0:_ KnownBits:00000010 SignBits:6
+ ; CHECK-NEXT: %1:_ KnownBits:11100000 SignBits:3
+ ; CHECK-NEXT: %2:_ KnownBits:00100010 SignBits:2
+ %0:_(s8) = G_CONSTANT i8 2
+ %1:_(s8) = G_CONSTANT i8 224
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: CstZero
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstZero
+ ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: CstNegOne
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstNegOne
+ ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %1:_ KnownBits:00000001 SignBits:7
+ ; CHECK-NEXT: %2:_ KnownBits:11111111 SignBits:8
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(s8) = G_CONSTANT i8 1
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: CstNegFour
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstNegFour
+ ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %1:_ KnownBits:00000100 SignBits:5
+ ; CHECK-NEXT: %2:_ KnownBits:11111100 SignBits:6
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(s8) = G_CONSTANT i8 4
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: CstNeg
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstNeg
+ ; CHECK-NEXT: %0:_ KnownBits:11100000 SignBits:3
+ ; CHECK-NEXT: %1:_ KnownBits:00000010 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:11011110 SignBits:2
+ %0:_(s8) = G_CONSTANT i8 224
+ %1:_(s8) = G_CONSTANT i8 2
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: ScalarVar
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarVar
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = COPY $b1
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: ScalarRhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarRhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 3
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: ScalarNonNegative
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarNonNegative
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4
+ ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %3:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:4
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 15
+ %2:_(s8) = G_AND %0, %1
+ %3:_(s8) = G_CONSTANT i8 0
+ %4:_(s8) = G_SUB %3, %2
+...
+---
+name: ScalarLhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarLhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 3
+ %2:_(s8) = G_SUB %1, %0
+...
+---
+name: ScalarPartKnown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarPartKnown
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4
+ ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %3:_ KnownBits:00000101 SignBits:5
+ ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:3
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 15
+ %2:_(s8) = G_AND %0, %1
+ %3:_(s8) = G_CONSTANT i8 5
+ %4:_(s8) = G_SUB %2, %3
+...
+---
+name: VectorCstZero
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCstZero
+ ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000000 SignBits:16
+ %0:_(s16) = G_CONSTANT i16 0
+ %1:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+ %3:_(<4 x s16>) = G_SUB %1, %2
+...
+---
+name: VectorCstNegOne
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCstNegOne
+ ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000001 SignBits:15
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000001 SignBits:15
+ ; CHECK-NEXT: %4:_ KnownBits:1111111111111111 SignBits:16
+ %0:_(s16) = G_CONSTANT i16 0
+ %1:_(s16) = G_CONSTANT i16 1
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %4:_(<4 x s16>) = G_SUB %2, %3
+...
+---
+name: VectorVar
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorVar
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(<4 x s16>) = COPY $d1
+ %2:_(<4 x s16>) = G_SUB %0, %1
+...
+---
+name: VectorRhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorRhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 3
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_SUB %2, %0
+...
+---
+name: VectorNonNegative
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorNonNegative
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8
+ ; CHECK-NEXT: %4:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %5:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %6:_ KnownBits:???????????????? SignBits:8
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 255
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_AND %0, %2
+ %4:_(s16) = G_CONSTANT i16 0
+ %5:_(<4 x s16>) = G_BUILD_VECTOR %4, %4, %4, %4
+ %6:_(<4 x s16>) = G_SUB %5, %3
+...
+---
+name: VectorLhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorLhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 3
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_SUB %0, %2
+...
+---
+name: VectorPartKnown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorPartKnown
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8
+ ; CHECK-NEXT: %4:_ KnownBits:0000000000101010 SignBits:10
+ ; CHECK-NEXT: %5:_ KnownBits:0000000001001010 SignBits:9
+ ; CHECK-NEXT: %6:_ KnownBits:000000000??01010 SignBits:9
+ ; CHECK-NEXT: %7:_ KnownBits:???????????????? SignBits:7
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 255
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_AND %0, %2
+ %4:_(s16) = G_CONSTANT i16 42
+ %5:_(s16) = G_CONSTANT i16 74
+ %6:_(<4 x s16>) = G_BUILD_VECTOR %4, %5, %5, %4
+ %7:_(<4 x s16>) = G_SUB %6, %3
+...
+---
+name: VectorCst36
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCst36
+ ; CHECK-NEXT: %0:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000110 SignBits:13
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000?1? SignBits:13
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000?1? SignBits:13
+ ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:12
+ %0:_(s16) = G_CONSTANT i16 3
+ %1:_(s16) = G_CONSTANT i16 6
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0
+ %4:_(<4 x s16>) = G_SUB %2, %3
+...
+
+---
+name: VectorCst3unknown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCst3unknown
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = COPY $h0
+ %2:_(s16) = G_CONSTANT i16 3
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
+ %4:_(<4 x s16>) = G_SUB %0, %3
+...
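
A quick arithmetic check of the constant cases in this new test, taking Cst as the example: as an 8-bit value, 2 - 224 wraps to 34 = 0b00100010, which is exactly the KnownBits:00100010 / SignBits:2 result the checks expect.

  #include <cstdint>

  // The Cst case from knownbits-sub.mir: i8 2 - i8 224 wraps to 34, i.e.
  // 0b00100010 -- every bit known, two leading (sign) zero bits.
  static_assert(static_cast<uint8_t>(uint8_t{2} - uint8_t{224}) == 0b00100010u,
                "2 - 224 (mod 256) == 34");
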
diff --git a/llvm/test/CodeGen/AArch64/adds_cmn.ll b/llvm/test/CodeGen/AArch64/adds_cmn.ll
index aa070b7..9b456a5 100644
--- a/llvm/test/CodeGen/AArch64/adds_cmn.ll
+++ b/llvm/test/CodeGen/AArch64/adds_cmn.ll
@@ -22,10 +22,8 @@ entry:
define { i32, i32 } @adds_cmn_c(i32 noundef %x, i32 noundef %y) {
; CHECK-LABEL: adds_cmn_c:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmn w0, w1
-; CHECK-NEXT: add w1, w1, w0
-; CHECK-NEXT: cset w8, lo
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: adds w1, w0, w1
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
entry:
%0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
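
The improved sequence folds the comparison into the addition: one flag-setting adds produces both the sum and the carry that cset materializes, where the old code recomputed the flags with a separate cmn. The underlying uadd.with.overflow pattern, as a C++ sketch (using the GCC/Clang builtin):

  #include <cstdint>

  // Sketch of the pattern compiled in adds_cmn_c: a single addition yields
  // the sum and the carry; unsigned overflow of the add *is* the carry.
  struct SumCarry {
    uint32_t Sum;
    bool Carry;
  };

  static SumCarry addWithCarry(uint32_t X, uint32_t Y) {
    SumCarry R;
    R.Carry = __builtin_add_overflow(X, Y, &R.Sum);
    return R;
  }
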
diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll
index ecd48d6..149b4c4 100644
--- a/llvm/test/CodeGen/AArch64/sat-add.ll
+++ b/llvm/test/CodeGen/AArch64/sat-add.ll
@@ -290,8 +290,7 @@ define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) {
define i32 @unsigned_sat_variable_i32_using_cmp_notval(i32 %x, i32 %y) {
; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_notval:
; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, w1
-; CHECK-NEXT: cmn w1, w0
+; CHECK-NEXT: adds w8, w1, w0
; CHECK-NEXT: csinv w0, w8, wzr, lo
; CHECK-NEXT: ret
%noty = xor i32 %y, -1
@@ -331,8 +330,7 @@ define i64 @unsigned_sat_variable_i64_using_cmp_sum(i64 %x, i64 %y) {
define i64 @unsigned_sat_variable_i64_using_cmp_notval(i64 %x, i64 %y) {
; CHECK-LABEL: unsigned_sat_variable_i64_using_cmp_notval:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, x1
-; CHECK-NEXT: cmn x1, x0
+; CHECK-NEXT: adds x8, x1, x0
; CHECK-NEXT: csinv x0, x8, xzr, lo
; CHECK-NEXT: ret
%noty = xor i64 %y, -1
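
Same fold as in adds_cmn.ll: the add and the compare collapse into a single adds whose carry drives csinv to select the saturated value. The operation being compiled, as a sketch:

  #include <cstdint>

  // Unsigned saturating add, the pattern sat-add.ll exercises: a wrapped
  // (carried-out) sum clamps to all-ones, matching adds + csinv ..., wzr, lo.
  static uint32_t usatAdd(uint32_t X, uint32_t Y) {
    uint32_t Sum = X + Y;
    return Sum < X ? UINT32_MAX : Sum; // Sum < X iff the add wrapped
  }
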
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
index f96a6f7..b239c46 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
@@ -1,13 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
-; GCN-LABEL: {{^}}kernel_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
+; GCN-LABEL: kernel_ieee_mode_default:
+; GCN: .amd_kernel_code_t
+; GCN-NEXT: amd_code_version_major = 1
+; GCN-NEXT: amd_code_version_minor = 2
+; GCN-NEXT: amd_machine_kind = 1
+; GCN-NEXT: amd_machine_version_major = 6
+; GCN-NEXT: amd_machine_version_minor = 0
+; GCN-NEXT: amd_machine_version_stepping = 0
+; GCN-NEXT: kernel_code_entry_byte_offset = 256
+; GCN-NEXT: kernel_code_prefetch_byte_size = 0
+; GCN-NEXT: granulated_workitem_vgpr_count = 0
+; GCN-NEXT: granulated_wavefront_sgpr_count = 0
+; GCN-NEXT: priority = 0
+; GCN-NEXT: float_mode = 240
+; GCN-NEXT: priv = 0
+; GCN-NEXT: enable_dx10_clamp = 1
+; GCN-NEXT: debug_mode = 0
+; GCN-NEXT: enable_ieee_mode = 1
+; GCN-NEXT: enable_wgp_mode = 0
+; GCN-NEXT: enable_mem_ordered = 0
+; GCN-NEXT: enable_fwd_progress = 0
+; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT: user_sgpr_count = 12
+; GCN-NEXT: enable_trap_handler = 0
+; GCN-NEXT: enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT: enable_sgpr_workgroup_info = 0
+; GCN-NEXT: enable_vgpr_workitem_id = 2
+; GCN-NEXT: enable_exception_msb = 0
+; GCN-NEXT: granulated_lds_size = 0
+; GCN-NEXT: enable_exception = 0
+; GCN-NEXT: enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT: enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT: enable_sgpr_queue_ptr = 1
+; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT: enable_sgpr_dispatch_id = 1
+; GCN-NEXT: enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT: enable_sgpr_private_segment_size = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT: enable_wavefront_size32 = 0
+; GCN-NEXT: enable_ordered_append_gds = 0
+; GCN-NEXT: private_element_size = 1
+; GCN-NEXT: is_ptr64 = 1
+; GCN-NEXT: is_dynamic_callstack = 0
+; GCN-NEXT: is_debug_enabled = 0
+; GCN-NEXT: is_xnack_enabled = 0
+; GCN-NEXT: workitem_private_segment_byte_size = 0
+; GCN-NEXT: workgroup_group_segment_byte_size = 0
+; GCN-NEXT: gds_segment_byte_size = 0
+; GCN-NEXT: kernarg_segment_byte_size = 16
+; GCN-NEXT: workgroup_fbarrier_count = 0
+; GCN-NEXT: wavefront_sgpr_count = 4
+; GCN-NEXT: workitem_vgpr_count = 2
+; GCN-NEXT: reserved_vgpr_first = 0
+; GCN-NEXT: reserved_vgpr_count = 0
+; GCN-NEXT: reserved_sgpr_first = 0
+; GCN-NEXT: reserved_sgpr_count = 0
+; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT: debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT: kernarg_segment_alignment = 4
+; GCN-NEXT: group_segment_alignment = 4
+; GCN-NEXT: private_segment_alignment = 4
+; GCN-NEXT: wavefront_size = 6
+; GCN-NEXT: call_convention = -1
+; GCN-NEXT: runtime_loader_kernel_symbol = 0
+; GCN-NEXT: .end_amd_kernel_code_t
+; GCN-NEXT: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -15,14 +91,89 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
ret void
}
-; GCN-LABEL: {{^}}kernel_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
+; GCN-LABEL: kernel_ieee_mode_on:
+; GCN: .amd_kernel_code_t
+; GCN-NEXT: amd_code_version_major = 1
+; GCN-NEXT: amd_code_version_minor = 2
+; GCN-NEXT: amd_machine_kind = 1
+; GCN-NEXT: amd_machine_version_major = 6
+; GCN-NEXT: amd_machine_version_minor = 0
+; GCN-NEXT: amd_machine_version_stepping = 0
+; GCN-NEXT: kernel_code_entry_byte_offset = 256
+; GCN-NEXT: kernel_code_prefetch_byte_size = 0
+; GCN-NEXT: granulated_workitem_vgpr_count = 0
+; GCN-NEXT: granulated_wavefront_sgpr_count = 0
+; GCN-NEXT: priority = 0
+; GCN-NEXT: float_mode = 240
+; GCN-NEXT: priv = 0
+; GCN-NEXT: enable_dx10_clamp = 1
+; GCN-NEXT: debug_mode = 0
+; GCN-NEXT: enable_ieee_mode = 1
+; GCN-NEXT: enable_wgp_mode = 0
+; GCN-NEXT: enable_mem_ordered = 0
+; GCN-NEXT: enable_fwd_progress = 0
+; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT: user_sgpr_count = 12
+; GCN-NEXT: enable_trap_handler = 0
+; GCN-NEXT: enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT: enable_sgpr_workgroup_info = 0
+; GCN-NEXT: enable_vgpr_workitem_id = 2
+; GCN-NEXT: enable_exception_msb = 0
+; GCN-NEXT: granulated_lds_size = 0
+; GCN-NEXT: enable_exception = 0
+; GCN-NEXT: enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT: enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT: enable_sgpr_queue_ptr = 1
+; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT: enable_sgpr_dispatch_id = 1
+; GCN-NEXT: enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT: enable_sgpr_private_segment_size = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT: enable_wavefront_size32 = 0
+; GCN-NEXT: enable_ordered_append_gds = 0
+; GCN-NEXT: private_element_size = 1
+; GCN-NEXT: is_ptr64 = 1
+; GCN-NEXT: is_dynamic_callstack = 0
+; GCN-NEXT: is_debug_enabled = 0
+; GCN-NEXT: is_xnack_enabled = 0
+; GCN-NEXT: workitem_private_segment_byte_size = 0
+; GCN-NEXT: workgroup_group_segment_byte_size = 0
+; GCN-NEXT: gds_segment_byte_size = 0
+; GCN-NEXT: kernarg_segment_byte_size = 16
+; GCN-NEXT: workgroup_fbarrier_count = 0
+; GCN-NEXT: wavefront_sgpr_count = 4
+; GCN-NEXT: workitem_vgpr_count = 2
+; GCN-NEXT: reserved_vgpr_first = 0
+; GCN-NEXT: reserved_vgpr_count = 0
+; GCN-NEXT: reserved_sgpr_first = 0
+; GCN-NEXT: reserved_sgpr_count = 0
+; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT: debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT: kernarg_segment_alignment = 4
+; GCN-NEXT: group_segment_alignment = 4
+; GCN-NEXT: private_segment_alignment = 4
+; GCN-NEXT: wavefront_size = 6
+; GCN-NEXT: call_convention = -1
+; GCN-NEXT: runtime_loader_kernel_symbol = 0
+; GCN-NEXT: .end_amd_kernel_code_t
+; GCN-NEXT: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -30,14 +181,87 @@ define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
ret void
}
-; GCN-LABEL: {{^}}kernel_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
+; GCN-LABEL: kernel_ieee_mode_off:
+; GCN: .amd_kernel_code_t
+; GCN-NEXT: amd_code_version_major = 1
+; GCN-NEXT: amd_code_version_minor = 2
+; GCN-NEXT: amd_machine_kind = 1
+; GCN-NEXT: amd_machine_version_major = 6
+; GCN-NEXT: amd_machine_version_minor = 0
+; GCN-NEXT: amd_machine_version_stepping = 0
+; GCN-NEXT: kernel_code_entry_byte_offset = 256
+; GCN-NEXT: kernel_code_prefetch_byte_size = 0
+; GCN-NEXT: granulated_workitem_vgpr_count = 0
+; GCN-NEXT: granulated_wavefront_sgpr_count = 0
+; GCN-NEXT: priority = 0
+; GCN-NEXT: float_mode = 240
+; GCN-NEXT: priv = 0
+; GCN-NEXT: enable_dx10_clamp = 1
+; GCN-NEXT: debug_mode = 0
+; GCN-NEXT: enable_ieee_mode = 0
+; GCN-NEXT: enable_wgp_mode = 0
+; GCN-NEXT: enable_mem_ordered = 0
+; GCN-NEXT: enable_fwd_progress = 0
+; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT: user_sgpr_count = 12
+; GCN-NEXT: enable_trap_handler = 0
+; GCN-NEXT: enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT: enable_sgpr_workgroup_info = 0
+; GCN-NEXT: enable_vgpr_workitem_id = 2
+; GCN-NEXT: enable_exception_msb = 0
+; GCN-NEXT: granulated_lds_size = 0
+; GCN-NEXT: enable_exception = 0
+; GCN-NEXT: enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT: enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT: enable_sgpr_queue_ptr = 1
+; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT: enable_sgpr_dispatch_id = 1
+; GCN-NEXT: enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT: enable_sgpr_private_segment_size = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT: enable_wavefront_size32 = 0
+; GCN-NEXT: enable_ordered_append_gds = 0
+; GCN-NEXT: private_element_size = 1
+; GCN-NEXT: is_ptr64 = 1
+; GCN-NEXT: is_dynamic_callstack = 0
+; GCN-NEXT: is_debug_enabled = 0
+; GCN-NEXT: is_xnack_enabled = 0
+; GCN-NEXT: workitem_private_segment_byte_size = 0
+; GCN-NEXT: workgroup_group_segment_byte_size = 0
+; GCN-NEXT: gds_segment_byte_size = 0
+; GCN-NEXT: kernarg_segment_byte_size = 16
+; GCN-NEXT: workgroup_fbarrier_count = 0
+; GCN-NEXT: wavefront_sgpr_count = 4
+; GCN-NEXT: workitem_vgpr_count = 2
+; GCN-NEXT: reserved_vgpr_first = 0
+; GCN-NEXT: reserved_vgpr_count = 0
+; GCN-NEXT: reserved_sgpr_first = 0
+; GCN-NEXT: reserved_sgpr_count = 0
+; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT: debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT: kernarg_segment_alignment = 4
+; GCN-NEXT: group_segment_alignment = 4
+; GCN-NEXT: private_segment_alignment = 4
+; GCN-NEXT: wavefront_size = 6
+; GCN-NEXT: call_convention = -1
+; GCN-NEXT: runtime_loader_kernel_symbol = 0
+; GCN-NEXT: .end_amd_kernel_code_t
+; GCN-NEXT: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -45,14 +269,22 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
ret void
}
-; GCN-LABEL: {{^}}func_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define void @func_ieee_mode_default() #0 {
+; GCN-LABEL: func_ieee_mode_default:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -60,14 +292,22 @@ define void @func_ieee_mode_default() #0 {
ret void
}
-; GCN-LABEL: {{^}}func_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define void @func_ieee_mode_on() #1 {
+; GCN-LABEL: func_ieee_mode_on:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -75,14 +315,20 @@ define void @func_ieee_mode_on() #1 {
ret void
}
-; GCN-LABEL: {{^}}func_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define void @func_ieee_mode_off() #2 {
+; GCN-LABEL: func_ieee_mode_off:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -90,14 +336,19 @@ define void @func_ieee_mode_off() #2 {
ret void
}
-; GCN-LABEL: {{^}}cs_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_cs void @cs_ieee_mode_default() #0 {
+; GCN-LABEL: cs_ieee_mode_default:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -105,14 +356,21 @@ define amdgpu_cs void @cs_ieee_mode_default() #0 {
ret void
}
-; GCN-LABEL: {{^}}cs_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_cs void @cs_ieee_mode_on() #1 {
+; GCN-LABEL: cs_ieee_mode_on:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -120,14 +378,19 @@ define amdgpu_cs void @cs_ieee_mode_on() #1 {
ret void
}
-; GCN-LABEL: {{^}}cs_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define amdgpu_cs void @cs_ieee_mode_off() #2 {
+; GCN-LABEL: cs_ieee_mode_off:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -135,14 +398,19 @@ define amdgpu_cs void @cs_ieee_mode_off() #2 {
ret void
}
-; GCN-LABEL: {{^}}ps_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define amdgpu_ps void @ps_ieee_mode_default() #0 {
+; GCN-LABEL: ps_ieee_mode_default:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -150,14 +418,21 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 {
ret void
}
-; GCN-LABEL: {{^}}ps_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_ps void @ps_ieee_mode_on() #1 {
+; GCN-LABEL: ps_ieee_mode_on:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -165,14 +440,19 @@ define amdgpu_ps void @ps_ieee_mode_on() #1 {
ret void
}
-; GCN-LABEL: {{^}}ps_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define amdgpu_ps void @ps_ieee_mode_off() #2 {
+; GCN-LABEL: ps_ieee_mode_off:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
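
The regenerated checks make the IEEE-mode contract visible in full: entry points with IEEE mode enabled keep the v_mul_f32 ..., 1.0 canonicalizations ahead of v_min_f32, while the mode-off kernels and the cs/ps entry points take the raw min. A C++ model of why the quieting step exists (illustrative only; a host compiler would not reliably preserve the multiply):

  #include <cmath>

  // Model of IEEE-mode minnum: signaling-NaN inputs must be quieted before
  // the min (the backend multiplies by 1.0 for this), and minnum returns
  // the non-NaN operand when exactly one input is NaN.
  static float ieeeMinnumModel(float A, float B) {
    A = A * 1.0f; // stands in for the hardware canonicalization
    B = B * 1.0f;
    if (std::isnan(A))
      return B;
    if (std::isnan(B))
      return A;
    return A < B ? A : B;
  }
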
diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
new file mode 100644
index 0000000..a4aad57
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
@@ -0,0 +1,59 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+---
+name: buffer_load_lds_not_valu
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; CHECK-LABEL: name: buffer_load_lds_not_valu
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $exec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF2]], [[DEF3]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF3]], [[V_ADD_U32_e32_]], implicit $exec
+ ; CHECK-NEXT: $m0 = S_MOV_B32 0
+ ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+ ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]], implicit $exec
+ ; CHECK-NEXT: $m0 = S_MOV_B32 1
+ ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+ ; CHECK-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_3]], [[V_ADD_U32_e32_4]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]], implicit $exec
+ ; CHECK-NEXT: dead [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_5]], [[V_ADD_U32_e32_6]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 4, 0
+ ; CHECK-NEXT: S_ENDPGM 0
+ $exec = IMPLICIT_DEF
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sgpr_128 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = V_ADD_U32_e32 %2, %3, implicit $exec
+ %5:vgpr_32 = V_ADD_U32_e32 %3, %4, implicit $exec
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+ $m0 = S_MOV_B32 1
+ BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+ %6:vgpr_32 = V_ADD_U32_e32 %4, %5, implicit $exec
+ %7:vgpr_32 = V_ADD_U32_e32 %5, %6, implicit $exec
+ %8:vgpr_32 = V_ADD_U32_e32 %6, %7, implicit $exec
+ %9:vgpr_32 = V_ADD_U32_e32 %7, %8, implicit $exec
+ %10:vgpr_32 = V_ADD_U32_e32 %8, %9, implicit $exec
+ %11:vgpr_32 = V_ADD_U32_e32 %9, %10, implicit $exec
+ SCHED_GROUP_BARRIER 2, 2, 0
+  SCHED_GROUP_BARRIER 4, 1, 0
+  SCHED_GROUP_BARRIER 2, 2, 0
+  SCHED_GROUP_BARRIER 4, 1, 0
+ SCHED_GROUP_BARRIER 2, 4, 0
+ S_ENDPGM 0
+...

diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
index c8fee5d..7cbe5de 100644
--- a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
+++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
@@ -119,9 +119,10 @@ body: |
; CHECK: [[R32:%[0-9]+]]:_(s32) = G_SUB [[COUNT]], [[BITDIFF]]
%2(s16) = G_CTLZ %1
- ; CHECK: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]]
- ; CHECK: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]]
- ; CHECK: $r0 = COPY [[R]]
+ ; LIBCALLS: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]]
+ ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]]
+ ; LIBCALLS: $r0 = COPY [[R]]
+ ; CLZ: $r0 = COPY [[R32]]
%3(s32) = G_SEXT %2(s16)
$r0 = COPY %3(s32)
BX_RET 14, $noreg, implicit $r0
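
The split into LIBCALLS and CLZ prefixes records why the shift pair is only needed on the libcall path: a native CLZ of an s16 value produces a count in [0, 16], so the result already equals its own sign extension and $r0 can take R32 directly. A quick model:

  #include <cstdint>

  // Model of the CLZ-path simplification: a 16-bit ctlz result lies in
  // [0, 16], so sign-extending it from 16 bits is a no-op and the
  // G_SHL/G_ASHR pair can be dropped.
  static int32_t sextOfCtlz16(uint16_t X) {
    return X ? __builtin_clz(static_cast<uint32_t>(X)) - 16 : 16;
  }
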
diff --git a/llvm/test/CodeGen/ARM/carry.ll b/llvm/test/CodeGen/ARM/carry.ll
index 558e2b0..a652241 100644
--- a/llvm/test/CodeGen/ARM/carry.ll
+++ b/llvm/test/CodeGen/ARM/carry.ll
@@ -1,61 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
; CHECK-LABEL: f1:
-; CHECK: subs r
-; CHECK: sbc r
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, r0, r2
+; CHECK-NEXT: sbc r1, r1, r3
+; CHECK-NEXT: bx lr
entry:
- %tmp = sub i64 %a, %b
- ret i64 %tmp
+ %tmp = sub i64 %a, %b
+ ret i64 %tmp
}
define i64 @f2(i64 %a, i64 %b) {
; CHECK-LABEL: f2:
-; CHECK: lsl r
-; CHECK: orr r
-; CHECK: rsbs r
-; CHECK: sbc r
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsl r1, r1, #1
+; CHECK-NEXT: orr r1, r1, r0, lsr #31
+; CHECK-NEXT: rsbs r0, r2, r0, lsl #1
+; CHECK-NEXT: sbc r1, r1, r3
+; CHECK-NEXT: bx lr
entry:
- %tmp1 = shl i64 %a, 1
- %tmp2 = sub i64 %tmp1, %b
- ret i64 %tmp2
+ %tmp1 = shl i64 %a, 1
+ %tmp2 = sub i64 %tmp1, %b
+ ret i64 %tmp2
}
; add with live carry
define i64 @f3(i32 %al, i32 %bl) {
; CHECK-LABEL: f3:
-; CHECK: adds r
-; CHECK: adc r
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: mov r2, #0
+; CHECK-NEXT: adcs r0, r1, #0
+; CHECK-NEXT: adc r1, r2, #0
+; CHECK-NEXT: bx lr
entry:
- ; unsigned wide add
- %aw = zext i32 %al to i64
- %bw = zext i32 %bl to i64
- %cw = add i64 %aw, %bw
- ; ch == carry bit
- %ch = lshr i64 %cw, 32
- %dw = add i64 %ch, %bw
- ret i64 %dw
+ ; unsigned wide add
+ %aw = zext i32 %al to i64
+ %bw = zext i32 %bl to i64
+ %cw = add i64 %aw, %bw
+ ; ch == carry bit
+ %ch = lshr i64 %cw, 32
+ %dw = add i64 %ch, %bw
+ ret i64 %dw
}
; rdar://10073745
define i64 @f4(i64 %x) nounwind readnone {
-entry:
; CHECK-LABEL: f4:
-; CHECK: rsbs r
-; CHECK: rsc r
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: rsbs r0, r0, #0
+; CHECK-NEXT: rsc r1, r1, #0
+; CHECK-NEXT: bx lr
+entry:
%0 = sub nsw i64 0, %x
ret i64 %0
}
; rdar://12559385
define i64 @f5(i32 %vi) {
-entry:
; CHECK-LABEL: f5:
-; CHECK: movw [[REG:r[0-9]+]], #36102
-; CHECK: sbc r{{[0-9]+}}, r{{[0-9]+}}, [[REG]]
- %v0 = zext i32 %vi to i64
- %v1 = xor i64 %v0, -155057456198619
- %v4 = add i64 %v1, 155057456198619
- %v5 = add i64 %v4, %v1
- ret i64 %v5
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movw r1, #19493
+; CHECK-NEXT: movw r2, #29433
+; CHECK-NEXT: movt r1, #57191
+; CHECK-NEXT: eor r0, r0, r1
+; CHECK-NEXT: movw r3, #46043
+; CHECK-NEXT: movt r2, #65535
+; CHECK-NEXT: adds r0, r0, r0
+; CHECK-NEXT: movw r1, #36102
+; CHECK-NEXT: sbc r2, r2, r1
+; CHECK-NEXT: movt r3, #8344
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adc r1, r2, r1
+; CHECK-NEXT: bx lr
+entry:
+ %v0 = zext i32 %vi to i64
+ %v1 = xor i64 %v0, -155057456198619
+ %v4 = add i64 %v1, 155057456198619
+ %v5 = add i64 %v4, %v1
+ ret i64 %v5
}
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
index 1edb387..f345e08 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
@@ -2,9 +2,13 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK_PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
index 2e80c4c..29b130f 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
@@ -2,9 +2,13 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK_PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
declare void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr)
declare void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
index 817b1d5..4e463a14 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
define void @test_tcgen05_cp_64x128_v1_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_64x128_v1_cg1(
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
index cbf647f..fc8cce4 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
declare void @llvm.nvvm.tcgen05.fence.before.thread.sync()
declare void @llvm.nvvm.tcgen05.fence.after.thread.sync()
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
index a37b1a9..22eb729 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
@@ -2,9 +2,13 @@
; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_100f -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_103a | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_100f | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110f | %ptxas-verify -arch=sm_110f %}
; CHECK-LABEL: nvvm_tcgen05_ld_16x64b
define void @nvvm_tcgen05_ld_16x64b(ptr addrspace(6) %taddr) {
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
index bf2adac..33483b5 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx90 | %ptxas-verify -arch=sm_110a %}
declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr)
declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
index 0636a06..ccf6541 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
@@ -2,9 +2,13 @@
; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_100f -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
; CHECK-LABEL: nvvm_tcgen05_st_16x64b
define void @nvvm_tcgen05_st_16x64b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i32> %stv2, <4 x i32> %stv4, <8 x i32> %stv8, <16 x i32> %stv16, <32 x i32> %stv32, <64 x i32> %stv64, <128 x i32> %stv128) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir
new file mode 100644
index 0000000..389283a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir
@@ -0,0 +1,523 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v \
+# RUN: -run-pass=phi-node-elimination,register-coalescer,riscv-insert-vsetvli | FileCheck %s
+
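+# The checks below were generated against the riscv-insert-vsetvli pass; they
+# record where the XSfmm state-setting pseudos (PseudoSF_VSETTNT,
+# PseudoSF_VSETTM, PseudoSF_VSETTK) are inserted around the matrix pseudos.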
+--- |
+ define void @xsfmm_same_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 noundef %tm, i64 noundef %tn, i64 noundef %tk) {
+ entry:
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ ret void
+ }
+
+ define void @xsfmm_different_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk) {
+ entry:
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 4)
+ ret void
+ }
+
+ define void @xsfmm_different_state_bf(<vscale x 32 x half> %tile1, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk) {
+ entry:
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64 2, <vscale x 32 x bfloat> %tile2, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ ret void
+ }
+
+ define <vscale x 64 x i8> @interleave_rvv_and_xsfmm(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) {
+ entry:
+ %0 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl)
+ %1 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl)
+ call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl)
+ ret <vscale x 64 x i8> %1
+ }
+
+ define <vscale x 64 x i8> @interleave_rvv_and_xsfmm2(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) {
+ entry:
+ %0 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %tile, i64 %vl)
+ %1 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl)
+ %2 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl)
+ call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl)
+ ret <vscale x 64 x i8> %2
+ }
+
+ define void @consecutive_xsfmm(<vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, ptr %base) {
+ entry:
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 0, <vscale x 32 x half> %tile, <vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ call void @llvm.riscv.sf.vste16.i64(i64 0, ptr %base, i64 %tn)
+ ret void
+ }
+
+ define i64 @vsettnt_max(i64 %vl) {
+ entry:
+ %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2)
+ %1 = call i64 @llvm.riscv.sf.vsettnt_max.i64(i64 1, i64 2)
+ ret i64 %0
+ }
+
+ define i64 @single_vsettm(i64 %vl) {
+ entry:
+ %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2)
+ ret i64 %0
+ }
+
+ define i64 @single_vsettn(i64 %vl) {
+ entry:
+ %0 = call i64 @llvm.riscv.sf.vsettn.i64(i64 %vl, i64 1, i64 2)
+ ret i64 %0
+ }
+
+ define i64 @single_vsettk(i64 %vl) {
+ entry:
+ %0 = call i64 @llvm.riscv.sf.vsettk.i64(i64 %vl, i64 1, i64 2)
+ ret i64 %0
+ }
+
+ define void @sf_vtzero(i64 %tm, i64 %tn) {
+ entry:
+ call void @llvm.riscv.sf.vtzero.i64(i64 1, i64 %tm, i64 %tn, i64 3, i64 4)
+ ret void
+ }
+
+ declare void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64, <vscale x 32 x half>, <vscale x 32 x half>, i64, i64, i64, i64)
+ declare void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64, <vscale x 32 x bfloat>, <vscale x 32 x bfloat>, i64, i64, i64, i64)
+ declare <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64, i64)
+ declare <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i8>, i64)
+ declare void @llvm.riscv.sf.vste16.i64(i64, ptr, i64)
+ declare i64 @llvm.riscv.sf.vsettnt_max.i64(i64, i64)
+ declare i64 @llvm.riscv.sf.vsettm.i64(i64, i64, i64)
+ declare i64 @llvm.riscv.sf.vsettn.i64(i64, i64, i64)
+ declare i64 @llvm.riscv.sf.vsettk.i64(i64, i64, i64)
+ declare void @llvm.riscv.sf.vtzero.i64(i64, i64, i64, i64, i64)
+...
+---
+name: xsfmm_same_state
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: vrm8 }
+ - { id: 2, class: gprnox0 }
+ - { id: 3, class: gprnox0 }
+ - { id: 4, class: gprnox0 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$v8m8', virtual-reg: '%1' }
+ - { reg: '$x10', virtual-reg: '%2' }
+ - { reg: '$x11', virtual-reg: '%3' }
+ - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-LABEL: name: xsfmm_same_state
+ ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %4:gprnox0 = COPY $x12
+ %3:gprnox0 = COPY $x11
+ %2:gprnox0 = COPY $x10
+ %1:vrm8 = COPY $v16m8
+ %0:vrm8 = COPY $v8m8
+ PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoRET
+...
+---
+name: xsfmm_different_state
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: vrm8 }
+ - { id: 2, class: gprnox0 }
+ - { id: 3, class: gprnox0 }
+ - { id: 4, class: gprnox0 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$v8m8', virtual-reg: '%1' }
+ - { reg: '$x10', virtual-reg: '%2' }
+ - { reg: '$x11', virtual-reg: '%3' }
+ - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-LABEL: name: xsfmm_different_state
+ ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1544 /* e16, w4 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 3, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 3, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 4, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %4:gprnox0 = COPY $x12
+ %3:gprnox0 = COPY $x11
+ %2:gprnox0 = COPY $x10
+ %1:vrm8 = COPY $v16m8
+ %0:vrm8 = COPY $v8m8
+ PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 4, implicit $frm
+ PseudoRET
+...
+---
+name: xsfmm_different_state_bf
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: vrm8 }
+ - { id: 2, class: gprnox0 }
+ - { id: 3, class: gprnox0 }
+ - { id: 4, class: gprnox0 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$v8m8', virtual-reg: '%1' }
+ - { reg: '$x10', virtual-reg: '%2' }
+ - { reg: '$x11', virtual-reg: '%3' }
+ - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-LABEL: name: xsfmm_different_state_bf
+ ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1288 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F_ALT $t2, [[COPY3]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %4:gprnox0 = COPY $x12
+ %3:gprnox0 = COPY $x11
+ %2:gprnox0 = COPY $x10
+ %1:vrm8 = COPY $v16m8
+ %0:vrm8 = COPY $v8m8
+ PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoSF_MM_F_F_ALT $t2, %1:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoRET
+...
+---
+name: interleave_rvv_and_xsfmm
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: gprnox0 }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: vrm8 }
+ - { id: 5, class: vrm8 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$x10', virtual-reg: '%1' }
+ - { reg: '$x11', virtual-reg: '%2' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $x10, $x11
+ ; CHECK-LABEL: name: interleave_rvv_and_xsfmm
+ ; CHECK: liveins: $v8m8, $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[PseudoSF_VTMV_V_T]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_]], implicit $vtype
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %2:gpr = COPY $x11
+ %1:gprnox0 = COPY $x10
+ %0:vrm8 = COPY $v8m8
+ %3:gpr = ADDI $x0, 1
+ %4:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1
+ %5:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0
+ PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1
+ $v8m8 = COPY %5:vrm8
+ PseudoRET implicit $v8m8
+...
+---
+name: interleave_rvv_and_xsfmm2
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: gprnox0 }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: vrm8 }
+ - { id: 5, class: vrm8 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$x10', virtual-reg: '%1' }
+ - { reg: '$x11', virtual-reg: '%2' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $x10, $x11
+ ; CHECK-LABEL: name: interleave_rvv_and_xsfmm2
+ ; CHECK: liveins: $v8m8, $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1
+ ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[COPY2]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoVADD_VV_M8_1:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[PseudoVADD_VV_M8_]], [[PseudoVADD_VV_M8_]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_1]], implicit $vtype
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %2:gpr = COPY $x11
+ %1:gprnox0 = COPY $x10
+ %0:vrm8 = COPY $v8m8
+ %3:gpr = ADDI $x0, 1
+ %4:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %0:vrm8, %1:gprnox0, 3, 0
+ %5:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1
+ %6:vrm8 = PseudoVADD_VV_M8 $noreg, %4:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0
+ PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1
+ $v8m8 = COPY %6:vrm8
+ PseudoRET implicit $v8m8
+...
+---
+name: consecutive_xsfmm
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: gprnox0 }
+ - { id: 2, class: gprnox0 }
+ - { id: 3, class: gprnox0 }
+ - { id: 4, class: gprnox0 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$x10', virtual-reg: '%1' }
+ - { reg: '$x11', virtual-reg: '%2' }
+ - { reg: '$x12', virtual-reg: '%3' }
+ - { reg: '$x13', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $x10, $x11, $x12, $x13
+ ; CHECK-LABEL: name: consecutive_xsfmm
+ ; CHECK: liveins: $v8m8, $x10, $x11, $x12, $x13
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:gprnox0 = COPY $x13
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY2]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY1]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY3]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY]], [[COPY]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY3]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: PseudoSF_VSTE16 [[COPY1]], [[COPY2]], $noreg, 4, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %0:vrm8 = COPY $v8m8
+ %1:gprnox0 = COPY $x10
+ %2:gprnox0 = COPY $x11
+ %3:gprnox0 = COPY $x12
+ %4:gprnox0 = COPY $x13
+ PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 2, implicit $frm
+ PseudoSF_VSTE16 %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 1
+ PseudoRET
+...
+---
+name: vsettnt_max
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10
+ ; CHECK-LABEL: name: vsettnt_max
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_1:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+ ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]]
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+ %2:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype
+ %3:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+ $x10 = COPY %3:gprnox0
+ PseudoRET implicit $x10
+...
+---
+name: single_vsettm
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10
+ ; CHECK-LABEL: name: single_vsettm
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+ ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]]
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+ $x10 = COPY %1:gprnox0
+ PseudoRET implicit $x10
+...
+---
+name: single_vsettn
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10
+ ; CHECK-LABEL: name: single_vsettn
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[PseudoSF_VSETTNT:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNT [[COPY]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTNT]]
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = PseudoSF_VSETTNT %0:gprnox0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype
+ $x10 = COPY %1:gprnox0
+ PseudoRET implicit $x10
+...
+---
+name: single_vsettk
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10
+ ; CHECK-LABEL: name: single_vsettk
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+ ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTK]]
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+ $x10 = COPY %1:gprnox0
+ PseudoRET implicit $x10
+...
+---
+name: sf_vtzero
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+ - { id: 1, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+ - { reg: '$x11', virtual-reg: '%1' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ ; CHECK-LABEL: name: sf_vtzero
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1536 /* e8, w4 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY]], 3, 3, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_VTZERO_T $t1, $noreg, $noreg, 3, 4, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = COPY $x11
+ PseudoSF_VTZERO_T $t1, %0:gprnox0, %1:gprnox0, 3, 4
+ PseudoRET
+...
diff --git a/llvm/test/CodeGen/Thumb2/carry.ll b/llvm/test/CodeGen/Thumb2/carry.ll
index 1e2b332..47c7918 100644
--- a/llvm/test/CodeGen/Thumb2/carry.ll
+++ b/llvm/test/CodeGen/Thumb2/carry.ll
@@ -1,35 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
-entry:
; CHECK-LABEL: f1:
-; CHECK: subs r0, r0, r2
-; CHECK: sbcs r1, r3
- %tmp = sub i64 %a, %b
- ret i64 %tmp
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, r0, r2
+; CHECK-NEXT: sbcs r1, r3
+; CHECK-NEXT: bx lr
+entry:
+ %tmp = sub i64 %a, %b
+ ret i64 %tmp
}
define i64 @f2(i64 %a, i64 %b) {
-entry:
; CHECK-LABEL: f2:
-; CHECK: lsls r1, r1, #1
-; CHECK: orr.w r1, r1, r0, lsr #31
-; CHECK: rsbs r0, r2, r0, lsl #1
-; CHECK: sbcs r1, r3
- %tmp1 = shl i64 %a, 1
- %tmp2 = sub i64 %tmp1, %b
- ret i64 %tmp2
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsls r1, r1, #1
+; CHECK-NEXT: orr.w r1, r1, r0, lsr #31
+; CHECK-NEXT: rsbs r0, r2, r0, lsl #1
+; CHECK-NEXT: sbcs r1, r3
+; CHECK-NEXT: bx lr
+entry:
+ %tmp1 = shl i64 %a, 1
+ %tmp2 = sub i64 %tmp1, %b
+ ret i64 %tmp2
}
; rdar://12559385
define i64 @f3(i32 %vi) {
-entry:
; CHECK-LABEL: f3:
-; CHECK: movw [[REG:r[0-9]+]], #36102
-; CHECK: sbcs r{{[0-9]+}}, [[REG]]
- %v0 = zext i32 %vi to i64
- %v1 = xor i64 %v0, -155057456198619
- %v4 = add i64 %v1, 155057456198619
- %v5 = add i64 %v4, %v1
- ret i64 %v5
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movw r1, #19493
+; CHECK-NEXT: movt r1, #57191
+; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: movw r2, #29433
+; CHECK-NEXT: movw r3, #46043
+; CHECK-NEXT: movw r1, #36102
+; CHECK-NEXT: movt r2, #65535
+; CHECK-NEXT: adds r0, r0, r0
+; CHECK-NEXT: movt r3, #8344
+; CHECK-NEXT: sbcs r2, r1
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: bx lr
+entry:
+ %v0 = zext i32 %vi to i64
+ %v1 = xor i64 %v0, -155057456198619
+ %v4 = add i64 %v1, 155057456198619
+ %v5 = add i64 %v4, %v1
+ ret i64 %v5
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
new file mode 100644
index 0000000..3654aae
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mattr=+simd128 | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
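+; These tests check that adding the even-lane and odd-lane shuffles of a
+; sign-extended i16x8 multiply is selected as i32x4.dot_i16x8_s; the negative
+; tests below must not produce the dot instruction.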
+define <4 x i32> @dot_sext_1(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_sext_1:
+; CHECK: .functype dot_sext_1 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+ %sext1 = sext <8 x i16> %a to <8 x i32>
+ %sext2 = sext <8 x i16> %b to <8 x i32>
+ %mul = mul <8 x i32> %sext1, %sext2
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res = add <4 x i32> %shuffle1, %shuffle2
+ ret <4 x i32> %res
+}
+
+
+define <4 x i32> @dot_sext_2(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_sext_2:
+; CHECK: .functype dot_sext_2 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+ %sext1 = sext <8 x i16> %a to <8 x i32>
+ %sext2 = sext <8 x i16> %b to <8 x i32>
+ %mul = mul <8 x i32> %sext1, %sext2
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res = add <4 x i32> %shuffle2, %shuffle1
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @dot_sext_self(<8 x i16> %v) {
+; CHECK-LABEL: dot_sext_self:
+; CHECK: .functype dot_sext_self (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %mul = mul <8 x i32> %sext, %sext
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res = add <4 x i32> %shuffle1, %shuffle2
+ ret <4 x i32> %res
+}
+
+; INFO: Negative test
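+; The operands are zero-extended, but i32x4.dot_i16x8_s sign-extends its
+; inputs, so the combine must not fire; extmul_low/high_u is used instead.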
+define <4 x i32> @dot_zext(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_zext:
+; CHECK: .functype dot_zext (v128, v128) -> (v128)
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extmul_low_i16x8_u
+; CHECK-NEXT: local.tee 2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extmul_high_i16x8_u
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: # fallthrough-return
+ %zext1 = zext <8 x i16> %a to <8 x i32>
+ %zext2 = zext <8 x i16> %b to <8 x i32>
+ %mul = mul <8 x i32> %zext1, %zext2
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res = add <4 x i32> %shuffle1, %shuffle2
+ ret <4 x i32> %res
+}
+
+; INFO: Negative test
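+; The shuffles split the multiply into low/high halves rather than even/odd
+; lanes, so this is not a pairwise add and the dot pattern does not apply.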
+define <4 x i32> @dot_wrong_shuffle(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_wrong_shuffle:
+; CHECK: .functype dot_wrong_shuffle (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extmul_low_i16x8_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extmul_high_i16x8_s
+; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: # fallthrough-return
+ %sext1 = sext <8 x i16> %a to <8 x i32>
+ %sext2 = sext <8 x i16> %b to <8 x i32>
+ %mul = mul <8 x i32> %sext1, %sext2
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %res = add <4 x i32> %shuffle1, %shuffle2
+ ret <4 x i32> %res
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
index e065de3..600241a 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -2,9 +2,278 @@
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s --check-prefix=RELAXED
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128, | FileCheck %s --check-prefix=STRICT
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=NOFP16
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefix=NOSIMD
target triple = "wasm32"
+define half @fadd_fmul_contract_f16(half %a, half %b, half %c) {
+; RELAXED-LABEL: fadd_fmul_contract_f16:
+; RELAXED: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: call $push0=, __truncsfhf2, $0
+; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT: call $push2=, __truncsfhf2, $1
+; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT: call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT: return $pop7
+;
+; STRICT-LABEL: fadd_fmul_contract_f16:
+; STRICT: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: call $push0=, __truncsfhf2, $0
+; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT: call $push2=, __truncsfhf2, $1
+; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT: call $push5=, __truncsfhf2, $2
+; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT: f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT: return $pop7
+;
+; NOFP16-LABEL: fadd_fmul_contract_f16:
+; NOFP16: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $0
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: return $pop7
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f16:
+; NOSIMD: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $0
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: return $pop7
+ %mul = fmul contract half %b, %a
+ %add = fadd contract half %mul, %c
+ ret half %add
+}
+
+define half @fmuladd_contract_f16(half %a, half %b, half %c) {
+; RELAXED-LABEL: fmuladd_contract_f16:
+; RELAXED: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: call $push0=, __truncsfhf2, $1
+; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT: call $push2=, __truncsfhf2, $0
+; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT: call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT: return $pop7
+;
+; STRICT-LABEL: fmuladd_contract_f16:
+; STRICT: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: call $push0=, __truncsfhf2, $1
+; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT: call $push2=, __truncsfhf2, $0
+; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT: call $push5=, __truncsfhf2, $2
+; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT: f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT: return $pop7
+;
+; NOFP16-LABEL: fmuladd_contract_f16:
+; NOFP16: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $0
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: return $pop7
+;
+; NOSIMD-LABEL: fmuladd_contract_f16:
+; NOSIMD: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $0
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: return $pop7
+ %fma = call contract half @llvm.fmuladd(half %a, half %b, half %c)
+ ret half %fma
+}
+
+define half @fmuladd_f16(half %a, half %b, half %c) {
+; RELAXED-LABEL: fmuladd_f16:
+; RELAXED: .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: call $push0=, __truncsfhf2, $1
+; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT: call $push2=, __truncsfhf2, $0
+; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT: call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT: return $pop7
+;
+; STRICT-LABEL: fmuladd_f16:
+; STRICT: .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: call $push0=, __truncsfhf2, $1
+; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT: call $push2=, __truncsfhf2, $0
+; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT: call $push5=, __truncsfhf2, $2
+; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT: f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT: return $pop7
+;
+; NOFP16-LABEL: fmuladd_f16:
+; NOFP16: .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $0
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: return $pop7
+;
+; NOSIMD-LABEL: fmuladd_f16:
+; NOSIMD: .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $0
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: return $pop7
+ %fma = call half @llvm.fmuladd(half %a, half %b, half %c)
+ ret half %fma
+}
+
+
+define float @fadd_fmul_contract_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fadd_fmul_contract_f32:
+; RELAXED: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32.mul $push0=, $1, $0
+; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fadd_fmul_contract_f32:
+; STRICT: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32.mul $push0=, $1, $0
+; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_f32:
+; NOFP16: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32.mul $push0=, $1, $0
+; NOFP16-NEXT: f32.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f32:
+; NOSIMD: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $1, $0
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %mul = fmul contract float %b, %a
+ %add = fadd contract float %mul, %c
+ ret float %add
+}
+
+define float @fmuladd_contract_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fmuladd_contract_f32:
+; RELAXED: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32.mul $push0=, $0, $1
+; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_contract_f32:
+; STRICT: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32.mul $push0=, $0, $1
+; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_f32:
+; NOFP16: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32.mul $push0=, $0, $1
+; NOFP16-NEXT: f32.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_f32:
+; NOSIMD: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $0, $1
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %fma = call contract float @llvm.fmuladd(float %a, float %b, float %c)
+ ret float %fma
+}
+
+define float @fmuladd_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fmuladd_f32:
+; RELAXED: .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32.mul $push0=, $0, $1
+; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_f32:
+; STRICT: .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32.mul $push0=, $0, $1
+; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_f32:
+; NOFP16: .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32.mul $push0=, $0, $1
+; NOFP16-NEXT: f32.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_f32:
+; NOSIMD: .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $0, $1
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %fma = call float @llvm.fmuladd(float %a, float %b, float %c)
+ ret float %fma
+}
+
define double @fadd_fmul_contract_f64(double %a, double %b, double %c) {
; RELAXED-LABEL: fadd_fmul_contract_f64:
; RELAXED: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
@@ -19,16 +288,94 @@ define double @fadd_fmul_contract_f64(double %a, double %b, double %c) {
; STRICT-NEXT: f64.mul $push0=, $1, $0
; STRICT-NEXT: f64.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_f64:
+; NOFP16: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64.mul $push0=, $1, $0
+; NOFP16-NEXT: f64.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f64:
+; NOSIMD: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $1, $0
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
%mul = fmul contract double %b, %a
%add = fadd contract double %mul, %c
ret double %add
}
+define double @fmuladd_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fmuladd_f64:
+; RELAXED: .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64.mul $push0=, $0, $1
+; RELAXED-NEXT: f64.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_f64:
+; STRICT: .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64.mul $push0=, $0, $1
+; STRICT-NEXT: f64.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_f64:
+; NOFP16: .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64.mul $push0=, $0, $1
+; NOFP16-NEXT: f64.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_f64:
+; NOSIMD: .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $0, $1
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %fma = call double @llvm.fmuladd(double %a, double %b, double %c)
+ ret double %fma
+}
+
+define double @fmuladd_contract_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fmuladd_contract_f64:
+; RELAXED: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64.mul $push0=, $0, $1
+; RELAXED-NEXT: f64.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_contract_f64:
+; STRICT: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64.mul $push0=, $0, $1
+; STRICT-NEXT: f64.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_f64:
+; NOFP16: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64.mul $push0=, $0, $1
+; NOFP16-NEXT: f64.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_f64:
+; NOSIMD: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $0, $1
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %fma = call contract double @llvm.fmuladd(double %a, double %b, double %c)
+ ret double %fma
+}
+
define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fadd_fmul_contract_4xf32:
; RELAXED: .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fadd_fmul_contract_4xf32:
@@ -37,31 +384,222 @@ define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4
; STRICT-NEXT: f32x4.mul $push0=, $1, $0
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_4xf32:
+; NOFP16: .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $1, $0
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_4xf32:
+; NOSIMD: .functype fadd_fmul_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $8, $4
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $7, $3
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $6, $2
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $5, $1
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop7
+; NOSIMD-NEXT: return
%mul = fmul contract <4 x float> %b, %a
%add = fadd contract <4 x float> %mul, %c
ret <4 x float> %add
}
-
define <8 x half> @fadd_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; RELAXED-LABEL: fadd_fmul_contract_8xf16:
; RELAXED: .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f16x8.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT: f16x8.madd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fadd_fmul_contract_8xf16:
; STRICT: .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
; STRICT-NEXT: # %bb.0:
-; STRICT-NEXT: f16x8.mul $push0=, $1, $0
-; STRICT-NEXT: f16x8.add $push1=, $pop0, $2
-; STRICT-NEXT: return $pop1
+; STRICT-NEXT: f16x8.madd $push0=, $1, $0, $2
+; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fadd_fmul_contract_8xf16:
+; NOFP16: .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $8
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $16
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT: i32.store16 14($0), $pop8
+; NOFP16-NEXT: call $push9=, __truncsfhf2, $7
+; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT: call $push11=, __truncsfhf2, $15
+; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT: call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT: i32.store16 12($0), $pop17
+; NOFP16-NEXT: call $push18=, __truncsfhf2, $6
+; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT: call $push20=, __truncsfhf2, $14
+; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT: call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT: i32.store16 10($0), $pop26
+; NOFP16-NEXT: call $push27=, __truncsfhf2, $5
+; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT: call $push29=, __truncsfhf2, $13
+; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT: call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT: i32.store16 8($0), $pop35
+; NOFP16-NEXT: call $push36=, __truncsfhf2, $4
+; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT: call $push38=, __truncsfhf2, $12
+; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT: call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT: i32.store16 6($0), $pop44
+; NOFP16-NEXT: call $push45=, __truncsfhf2, $3
+; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT: call $push47=, __truncsfhf2, $11
+; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT: call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT: i32.store16 4($0), $pop53
+; NOFP16-NEXT: call $push54=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT: call $push56=, __truncsfhf2, $10
+; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT: call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT: i32.store16 2($0), $pop62
+; NOFP16-NEXT: call $push63=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT: call $push65=, __truncsfhf2, $9
+; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT: call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT: i32.store16 0($0), $pop71
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fadd_fmul_contract_8xf16:
+; NOSIMD: .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $8
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $16
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT: i32.store16 14($0), $pop8
+; NOSIMD-NEXT: call $push9=, __truncsfhf2, $7
+; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT: call $push11=, __truncsfhf2, $15
+; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT: i32.store16 12($0), $pop17
+; NOSIMD-NEXT: call $push18=, __truncsfhf2, $6
+; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT: call $push20=, __truncsfhf2, $14
+; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT: i32.store16 10($0), $pop26
+; NOSIMD-NEXT: call $push27=, __truncsfhf2, $5
+; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT: call $push29=, __truncsfhf2, $13
+; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT: i32.store16 8($0), $pop35
+; NOSIMD-NEXT: call $push36=, __truncsfhf2, $4
+; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT: call $push38=, __truncsfhf2, $12
+; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT: i32.store16 6($0), $pop44
+; NOSIMD-NEXT: call $push45=, __truncsfhf2, $3
+; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT: call $push47=, __truncsfhf2, $11
+; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT: i32.store16 4($0), $pop53
+; NOSIMD-NEXT: call $push54=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT: call $push56=, __truncsfhf2, $10
+; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT: i32.store16 2($0), $pop62
+; NOSIMD-NEXT: call $push63=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT: call $push65=, __truncsfhf2, $9
+; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT: i32.store16 0($0), $pop71
+; NOSIMD-NEXT: return
%mul = fmul contract <8 x half> %b, %a
%add = fadd contract <8 x half> %mul, %c
ret <8 x half> %add
}
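The NOFP16 expansion above is one fixed pattern repeated per lane: each half value arrives promoted to f32, is rounded back to half (__truncsfhf2), re-extended to f32 (__extendhfsf2), combined in f32 arithmetic, and rounded once more before the 16-bit store. A sketch of one lane's semantics in IR (value names are illustrative):

  %a.h  = fptrunc float %a.promoted to half  ; __truncsfhf2
  %a.f  = fpext half %a.h to float           ; __extendhfsf2
  %b.h  = fptrunc float %b.promoted to half
  %b.f  = fpext half %b.h to float
  %mul  = fmul float %a.f, %b.f
  %c.h  = fptrunc float %c.promoted to half
  %c.f  = fpext half %c.h to float
  %add  = fadd float %mul, %c.f
  %res  = fptrunc float %add to half         ; stored with i32.store16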
-
define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fadd_fmul_4xf32:
; RELAXED: .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128)
@@ -76,16 +614,412 @@ define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float>
; STRICT-NEXT: f32x4.mul $push0=, $1, $0
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_4xf32:
+; NOFP16: .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $1, $0
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_4xf32:
+; NOSIMD: .functype fadd_fmul_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $8, $4
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $7, $3
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $6, $2
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $5, $1
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop7
+; NOSIMD-NEXT: return
%mul = fmul <4 x float> %b, %a
%add = fadd contract <4 x float> %mul, %c
ret <4 x float> %add
}
+define <8 x half> @fmuladd_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_contract_8xf16:
+; RELAXED: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_contract_8xf16:
+; STRICT: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.madd $push0=, $0, $1, $2
+; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fmuladd_contract_8xf16:
+; NOFP16: .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $16
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $8
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT: i32.store16 14($0), $pop8
+; NOFP16-NEXT: call $push9=, __truncsfhf2, $15
+; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT: call $push11=, __truncsfhf2, $7
+; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT: call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT: i32.store16 12($0), $pop17
+; NOFP16-NEXT: call $push18=, __truncsfhf2, $14
+; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT: call $push20=, __truncsfhf2, $6
+; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT: call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT: i32.store16 10($0), $pop26
+; NOFP16-NEXT: call $push27=, __truncsfhf2, $13
+; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT: call $push29=, __truncsfhf2, $5
+; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT: call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT: i32.store16 8($0), $pop35
+; NOFP16-NEXT: call $push36=, __truncsfhf2, $12
+; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT: call $push38=, __truncsfhf2, $4
+; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT: call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT: i32.store16 6($0), $pop44
+; NOFP16-NEXT: call $push45=, __truncsfhf2, $11
+; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT: call $push47=, __truncsfhf2, $3
+; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT: call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT: i32.store16 4($0), $pop53
+; NOFP16-NEXT: call $push54=, __truncsfhf2, $10
+; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT: call $push56=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT: call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT: i32.store16 2($0), $pop62
+; NOFP16-NEXT: call $push63=, __truncsfhf2, $9
+; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT: call $push65=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT: call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT: i32.store16 0($0), $pop71
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fmuladd_contract_8xf16:
+; NOSIMD: .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $16
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $8
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT: i32.store16 14($0), $pop8
+; NOSIMD-NEXT: call $push9=, __truncsfhf2, $15
+; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT: call $push11=, __truncsfhf2, $7
+; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT: i32.store16 12($0), $pop17
+; NOSIMD-NEXT: call $push18=, __truncsfhf2, $14
+; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT: call $push20=, __truncsfhf2, $6
+; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT: i32.store16 10($0), $pop26
+; NOSIMD-NEXT: call $push27=, __truncsfhf2, $13
+; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT: call $push29=, __truncsfhf2, $5
+; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT: i32.store16 8($0), $pop35
+; NOSIMD-NEXT: call $push36=, __truncsfhf2, $12
+; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT: call $push38=, __truncsfhf2, $4
+; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT: i32.store16 6($0), $pop44
+; NOSIMD-NEXT: call $push45=, __truncsfhf2, $11
+; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT: call $push47=, __truncsfhf2, $3
+; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT: i32.store16 4($0), $pop53
+; NOSIMD-NEXT: call $push54=, __truncsfhf2, $10
+; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT: call $push56=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT: i32.store16 2($0), $pop62
+; NOSIMD-NEXT: call $push63=, __truncsfhf2, $9
+; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT: call $push65=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT: i32.store16 0($0), $pop71
+; NOSIMD-NEXT: return
+ %fma = call contract <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
+
+define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_8xf16:
+; RELAXED: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_8xf16:
+; STRICT: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.madd $push0=, $0, $1, $2
+; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fmuladd_8xf16:
+; NOFP16: .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $16
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $8
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT: i32.store16 14($0), $pop8
+; NOFP16-NEXT: call $push9=, __truncsfhf2, $15
+; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT: call $push11=, __truncsfhf2, $7
+; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT: call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT: i32.store16 12($0), $pop17
+; NOFP16-NEXT: call $push18=, __truncsfhf2, $14
+; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT: call $push20=, __truncsfhf2, $6
+; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT: call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT: i32.store16 10($0), $pop26
+; NOFP16-NEXT: call $push27=, __truncsfhf2, $13
+; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT: call $push29=, __truncsfhf2, $5
+; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT: call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT: i32.store16 8($0), $pop35
+; NOFP16-NEXT: call $push36=, __truncsfhf2, $12
+; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT: call $push38=, __truncsfhf2, $4
+; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT: call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT: i32.store16 6($0), $pop44
+; NOFP16-NEXT: call $push45=, __truncsfhf2, $11
+; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT: call $push47=, __truncsfhf2, $3
+; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT: call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT: i32.store16 4($0), $pop53
+; NOFP16-NEXT: call $push54=, __truncsfhf2, $10
+; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT: call $push56=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT: call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT: i32.store16 2($0), $pop62
+; NOFP16-NEXT: call $push63=, __truncsfhf2, $9
+; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT: call $push65=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT: call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT: i32.store16 0($0), $pop71
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fmuladd_8xf16:
+; NOSIMD: .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $16
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $8
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT: i32.store16 14($0), $pop8
+; NOSIMD-NEXT: call $push9=, __truncsfhf2, $15
+; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT: call $push11=, __truncsfhf2, $7
+; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT: i32.store16 12($0), $pop17
+; NOSIMD-NEXT: call $push18=, __truncsfhf2, $14
+; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT: call $push20=, __truncsfhf2, $6
+; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT: i32.store16 10($0), $pop26
+; NOSIMD-NEXT: call $push27=, __truncsfhf2, $13
+; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT: call $push29=, __truncsfhf2, $5
+; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT: i32.store16 8($0), $pop35
+; NOSIMD-NEXT: call $push36=, __truncsfhf2, $12
+; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT: call $push38=, __truncsfhf2, $4
+; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT: i32.store16 6($0), $pop44
+; NOSIMD-NEXT: call $push45=, __truncsfhf2, $11
+; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT: call $push47=, __truncsfhf2, $3
+; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT: i32.store16 4($0), $pop53
+; NOSIMD-NEXT: call $push54=, __truncsfhf2, $10
+; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT: call $push56=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT: i32.store16 2($0), $pop62
+; NOSIMD-NEXT: call $push63=, __truncsfhf2, $9
+; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT: call $push65=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT: i32.store16 0($0), $pop71
+; NOSIMD-NEXT: return
+ %fma = call <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
+
define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fmuladd_contract_4xf32:
; RELAXED: .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fmuladd_contract_4xf32:
@@ -94,18 +1028,40 @@ define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x
; STRICT-NEXT: f32x4.mul $push0=, $0, $1
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_4xf32:
+; NOFP16: .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $0, $1
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_4xf32:
+; NOSIMD: .functype fmuladd_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $4, $8
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $3, $7
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $2, $6
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $1, $5
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop7
+; NOSIMD-NEXT: return
%fma = call contract <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
-; TODO: This should also have relaxed_madd in RELAXED case
define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fmuladd_4xf32:
; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.mul $push0=, $0, $1
-; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2
-; RELAXED-NEXT: return $pop1
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fmuladd_4xf32:
; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
@@ -113,10 +1069,170 @@ define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c
; STRICT-NEXT: f32x4.mul $push0=, $0, $1
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_4xf32:
+; NOFP16: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $0, $1
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_4xf32:
+; NOSIMD: .functype fmuladd_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $4, $8
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $3, $7
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $2, $6
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $1, $5
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop7
+; NOSIMD-NEXT: return
%fma = call <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
+define <8 x float> @fmuladd_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; RELAXED-LABEL: fmuladd_8xf32:
+; RELAXED: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32x4.mul $push0=, $2, $4
+; RELAXED-NEXT: f32x4.add $push1=, $pop0, $6
+; RELAXED-NEXT: v128.store 16($0), $pop1
+; RELAXED-NEXT: f32x4.mul $push2=, $1, $3
+; RELAXED-NEXT: f32x4.add $push3=, $pop2, $5
+; RELAXED-NEXT: v128.store 0($0), $pop3
+; RELAXED-NEXT: return
+;
+; STRICT-LABEL: fmuladd_8xf32:
+; STRICT: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32x4.mul $push0=, $2, $4
+; STRICT-NEXT: f32x4.add $push1=, $pop0, $6
+; STRICT-NEXT: v128.store 16($0), $pop1
+; STRICT-NEXT: f32x4.mul $push2=, $1, $3
+; STRICT-NEXT: f32x4.add $push3=, $pop2, $5
+; STRICT-NEXT: v128.store 0($0), $pop3
+; STRICT-NEXT: return
+;
+; NOFP16-LABEL: fmuladd_8xf32:
+; NOFP16: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $2, $4
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $6
+; NOFP16-NEXT: v128.store 16($0), $pop1
+; NOFP16-NEXT: f32x4.mul $push2=, $1, $3
+; NOFP16-NEXT: f32x4.add $push3=, $pop2, $5
+; NOFP16-NEXT: v128.store 0($0), $pop3
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fmuladd_8xf32:
+; NOSIMD: .functype fmuladd_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $8, $16
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $24
+; NOSIMD-NEXT: f32.store 28($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $7, $15
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $23
+; NOSIMD-NEXT: f32.store 24($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $6, $14
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $22
+; NOSIMD-NEXT: f32.store 20($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $5, $13
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $21
+; NOSIMD-NEXT: f32.store 16($0), $pop7
+; NOSIMD-NEXT: f32.mul $push8=, $4, $12
+; NOSIMD-NEXT: f32.add $push9=, $pop8, $20
+; NOSIMD-NEXT: f32.store 12($0), $pop9
+; NOSIMD-NEXT: f32.mul $push10=, $3, $11
+; NOSIMD-NEXT: f32.add $push11=, $pop10, $19
+; NOSIMD-NEXT: f32.store 8($0), $pop11
+; NOSIMD-NEXT: f32.mul $push12=, $2, $10
+; NOSIMD-NEXT: f32.add $push13=, $pop12, $18
+; NOSIMD-NEXT: f32.store 4($0), $pop13
+; NOSIMD-NEXT: f32.mul $push14=, $1, $9
+; NOSIMD-NEXT: f32.add $push15=, $pop14, $17
+; NOSIMD-NEXT: f32.store 0($0), $pop15
+; NOSIMD-NEXT: return
+ %fma = call <8 x float> @llvm.fmuladd(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+ ret <8 x float> %fma
+}
+
+define <2 x double> @fmuladd_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_contract_2xf64:
+; RELAXED: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_contract_2xf64:
+; STRICT: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64x2.mul $push0=, $0, $1
+; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_2xf64:
+; NOFP16: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64x2.mul $push0=, $0, $1
+; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_2xf64:
+; NOSIMD: .functype fmuladd_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $2, $4
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT: f64.store 8($0), $pop1
+; NOSIMD-NEXT: f64.mul $push2=, $1, $3
+; NOSIMD-NEXT: f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT: f64.store 0($0), $pop3
+; NOSIMD-NEXT: return
+ %fma = call contract <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+ ret <2 x double> %fma
+}
+
+define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_2xf64:
+; RELAXED: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_2xf64:
+; STRICT: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64x2.mul $push0=, $0, $1
+; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_2xf64:
+; NOFP16: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64x2.mul $push0=, $0, $1
+; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_2xf64:
+; NOSIMD: .functype fmuladd_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $2, $4
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT: f64.store 8($0), $pop1
+; NOSIMD-NEXT: f64.mul $push2=, $1, $3
+; NOSIMD-NEXT: f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT: f64.store 0($0), $pop3
+; NOSIMD-NEXT: return
+ %fma = call <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+ ret <2 x double> %fma
+}
+
define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fma_4xf32:
; RELAXED: .functype fma_4xf32 (v128, v128, v128) -> (v128)
@@ -167,6 +1283,44 @@ define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; STRICT-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
; STRICT-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
; STRICT-NEXT: return $pop19
+;
+; NOFP16-LABEL: fma_4xf32:
+; NOFP16: .functype fma_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.extract_lane $push2=, $0, 0
+; NOFP16-NEXT: f32x4.extract_lane $push1=, $1, 0
+; NOFP16-NEXT: f32x4.extract_lane $push0=, $2, 0
+; NOFP16-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0
+; NOFP16-NEXT: f32x4.splat $push4=, $pop3
+; NOFP16-NEXT: f32x4.extract_lane $push7=, $0, 1
+; NOFP16-NEXT: f32x4.extract_lane $push6=, $1, 1
+; NOFP16-NEXT: f32x4.extract_lane $push5=, $2, 1
+; NOFP16-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5
+; NOFP16-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8
+; NOFP16-NEXT: f32x4.extract_lane $push12=, $0, 2
+; NOFP16-NEXT: f32x4.extract_lane $push11=, $1, 2
+; NOFP16-NEXT: f32x4.extract_lane $push10=, $2, 2
+; NOFP16-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10
+; NOFP16-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13
+; NOFP16-NEXT: f32x4.extract_lane $push17=, $0, 3
+; NOFP16-NEXT: f32x4.extract_lane $push16=, $1, 3
+; NOFP16-NEXT: f32x4.extract_lane $push15=, $2, 3
+; NOFP16-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
+; NOFP16-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
+; NOFP16-NEXT: return $pop19
+;
+; NOSIMD-LABEL: fma_4xf32:
+; NOSIMD: .functype fma_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, fmaf, $4, $8, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop0
+; NOSIMD-NEXT: call $push1=, fmaf, $3, $7, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop1
+; NOSIMD-NEXT: call $push2=, fmaf, $2, $6, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop2
+; NOSIMD-NEXT: call $push3=, fmaf, $1, $5, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop3
+; NOSIMD-NEXT: return
%fma = call <4 x float> @llvm.fma(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
@@ -176,9 +1330,9 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
; RELAXED-LABEL: fadd_fmul_contract_8xf32:
; RELAXED: .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $6, $4, $2
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $4, $2, $6
; RELAXED-NEXT: v128.store 16($0), $pop0
-; RELAXED-NEXT: f32x4.relaxed_madd $push1=, $5, $3, $1
+; RELAXED-NEXT: f32x4.relaxed_madd $push1=, $3, $1, $5
; RELAXED-NEXT: v128.store 0($0), $pop1
; RELAXED-NEXT: return
;
@@ -192,17 +1346,56 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
; STRICT-NEXT: f32x4.add $push3=, $pop2, $5
; STRICT-NEXT: v128.store 0($0), $pop3
; STRICT-NEXT: return
+;
+; NOFP16-LABEL: fadd_fmul_contract_8xf32:
+; NOFP16: .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $4, $2
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $6
+; NOFP16-NEXT: v128.store 16($0), $pop1
+; NOFP16-NEXT: f32x4.mul $push2=, $3, $1
+; NOFP16-NEXT: f32x4.add $push3=, $pop2, $5
+; NOFP16-NEXT: v128.store 0($0), $pop3
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fadd_fmul_contract_8xf32:
+; NOSIMD: .functype fadd_fmul_contract_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $16, $8
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $24
+; NOSIMD-NEXT: f32.store 28($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $15, $7
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $23
+; NOSIMD-NEXT: f32.store 24($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $14, $6
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $22
+; NOSIMD-NEXT: f32.store 20($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $13, $5
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $21
+; NOSIMD-NEXT: f32.store 16($0), $pop7
+; NOSIMD-NEXT: f32.mul $push8=, $12, $4
+; NOSIMD-NEXT: f32.add $push9=, $pop8, $20
+; NOSIMD-NEXT: f32.store 12($0), $pop9
+; NOSIMD-NEXT: f32.mul $push10=, $11, $3
+; NOSIMD-NEXT: f32.add $push11=, $pop10, $19
+; NOSIMD-NEXT: f32.store 8($0), $pop11
+; NOSIMD-NEXT: f32.mul $push12=, $10, $2
+; NOSIMD-NEXT: f32.add $push13=, $pop12, $18
+; NOSIMD-NEXT: f32.store 4($0), $pop13
+; NOSIMD-NEXT: f32.mul $push14=, $9, $1
+; NOSIMD-NEXT: f32.add $push15=, $pop14, $17
+; NOSIMD-NEXT: f32.store 0($0), $pop15
+; NOSIMD-NEXT: return
%mul = fmul contract <8 x float> %b, %a
%add = fadd contract <8 x float> %mul, %c
ret <8 x float> %add
}
-
define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; RELAXED-LABEL: fadd_fmul_contract_2xf64:
; RELAXED: .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fadd_fmul_contract_2xf64:
@@ -211,28 +1404,64 @@ define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b,
; STRICT-NEXT: f64x2.mul $push0=, $1, $0
; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_2xf64:
+; NOFP16: .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64x2.mul $push0=, $1, $0
+; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_2xf64:
+; NOSIMD: .functype fadd_fmul_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $4, $2
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT: f64.store 8($0), $pop1
+; NOSIMD-NEXT: f64.mul $push2=, $3, $1
+; NOSIMD-NEXT: f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT: f64.store 0($0), $pop3
+; NOSIMD-NEXT: return
%mul = fmul contract <2 x double> %b, %a
%add = fadd contract <2 x double> %mul, %c
ret <2 x double> %add
}
-define float @fadd_fmul_contract_f32(float %a, float %b, float %c) {
-; RELAXED-LABEL: fadd_fmul_contract_f32:
-; RELAXED: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+define <2 x double> @fadd_fmul_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fadd_fmul_2xf64:
+; RELAXED: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32.mul $push0=, $1, $0
-; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: f64x2.mul $push0=, $1, $0
+; RELAXED-NEXT: f64x2.add $push1=, $pop0, $2
; RELAXED-NEXT: return $pop1
;
-; STRICT-LABEL: fadd_fmul_contract_f32:
-; STRICT: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-LABEL: fadd_fmul_2xf64:
+; STRICT: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
; STRICT-NEXT: # %bb.0:
-; STRICT-NEXT: f32.mul $push0=, $1, $0
-; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: f64x2.mul $push0=, $1, $0
+; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
- %mul = fmul contract float %b, %a
- %add = fadd contract float %mul, %c
- ret float %add
+;
+; NOFP16-LABEL: fadd_fmul_2xf64:
+; NOFP16: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64x2.mul $push0=, $1, $0
+; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_2xf64:
+; NOSIMD: .functype fadd_fmul_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $4, $2
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT: f64.store 8($0), $pop1
+; NOSIMD-NEXT: f64.mul $push2=, $3, $1
+; NOSIMD-NEXT: f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT: f64.store 0($0), $pop3
+; NOSIMD-NEXT: return
+ %mul = fmul <2 x double> %b, %a
+ %add = fadd <2 x double> %mul, %c
+ ret <2 x double> %add
}
define float @fma_f32(float %a, float %b, float %c) {
@@ -247,6 +1476,18 @@ define float @fma_f32(float %a, float %b, float %c) {
; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: call $push0=, fmaf, $0, $1, $2
; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fma_f32:
+; NOFP16: .functype fma_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, fmaf, $0, $1, $2
+; NOFP16-NEXT: return $pop0
+;
+; NOSIMD-LABEL: fma_f32:
+; NOSIMD: .functype fma_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, fmaf, $0, $1, $2
+; NOSIMD-NEXT: return $pop0
%fma = call float @llvm.fma(float %a, float %b, float %c)
ret float %fma
}
@@ -263,6 +1504,18 @@ define double @fma_f64(double %a, double %b, double %c) {
; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: call $push0=, fma, $0, $1, $2
; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fma_f64:
+; NOFP16: .functype fma_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, fma, $0, $1, $2
+; NOFP16-NEXT: return $pop0
+;
+; NOSIMD-LABEL: fma_f64:
+; NOSIMD: .functype fma_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, fma, $0, $1, $2
+; NOSIMD-NEXT: return $pop0
%fma = call double @llvm.fma(double %a, double %b, double %c)
ret double %fma
}
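The intrinsic/flag combinations exercised in this file give the backend different degrees of freedom; a minimal IR sketch (vector width illustrative):

  ; fmul+fadd carrying 'contract' may be fused into a single madd:
  %mul = fmul contract <4 x float> %b, %a
  %add = fadd contract <4 x float> %mul, %c
  ; llvm.fmuladd leaves fusing to the target, so relaxed-simd can
  ; select f32x4.relaxed_madd even without fast-math flags:
  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  ; llvm.fma demands a correctly rounded fused result; with no native
  ; FMA instruction it is lowered to per-lane fmaf/fma libcalls:
  %fma2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)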
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
index 6e2d860..b90c1da 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
@@ -27,7 +27,7 @@ define <4 x float> @fsub_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4
; RELAXED-LABEL: fsub_fmul_contract_4xf32:
; RELAXED: .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fsub_fmul_contract_4xf32:
@@ -46,15 +46,14 @@ define <8 x half> @fsub_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x h
; RELAXED-LABEL: fsub_fmul_contract_8xf16:
; RELAXED: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f16x8.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT: f16x8.nmadd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fsub_fmul_contract_8xf16:
; STRICT: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
; STRICT-NEXT: # %bb.0:
-; STRICT-NEXT: f16x8.mul $push0=, $1, $0
-; STRICT-NEXT: f16x8.sub $push1=, $2, $pop0
-; STRICT-NEXT: return $pop1
+; STRICT-NEXT: f16x8.nmadd $push0=, $1, $0, $2
+; STRICT-NEXT: return $pop0
%mul = fmul contract <8 x half> %b, %a
%sub = fsub contract <8 x half> %c, %mul
ret <8 x half> %sub
@@ -84,9 +83,9 @@ define <8 x float> @fsub_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
; RELAXED-LABEL: fsub_fmul_contract_8xf32:
; RELAXED: .functype fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $6, $4, $2
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $4, $2, $6
; RELAXED-NEXT: v128.store 16($0), $pop0
-; RELAXED-NEXT: f32x4.relaxed_nmadd $push1=, $5, $3, $1
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push1=, $3, $1, $5
; RELAXED-NEXT: v128.store 0($0), $pop1
; RELAXED-NEXT: return
;
@@ -110,7 +109,7 @@ define <2 x double> @fsub_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b,
; RELAXED-LABEL: fsub_fmul_contract_2xf64:
; RELAXED: .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fsub_fmul_contract_2xf64:
@@ -143,3 +142,55 @@ define float @fsub_fmul_contract_f32(float %a, float %b, float %c) {
ret float %sub
}
+define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_8xf16:
+; RELAXED: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_8xf16:
+; STRICT: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.nmadd $push0=, $0, $1, $2
+; STRICT-NEXT: return $pop0
+ %fneg = fneg <8 x half> %a
+ %fma = call <8 x half> @llvm.fmuladd(<8 x half> %fneg, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
+
+define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fmuladd_4xf32:
+; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_4xf32:
+; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32x4.mul $push0=, $0, $1
+; STRICT-NEXT: f32x4.sub $push1=, $2, $pop0
+; STRICT-NEXT: return $pop1
+ %fneg = fneg <4 x float> %a
+ %fma = call <4 x float> @llvm.fmuladd(<4 x float> %fneg, <4 x float> %b, <4 x float> %c)
+ ret <4 x float> %fma
+}
+
+define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_2xf64:
+; RELAXED: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_2xf64:
+; STRICT: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64x2.mul $push0=, $0, $1
+; STRICT-NEXT: f64x2.sub $push1=, $2, $pop0
+; STRICT-NEXT: return $pop1
+ %fneg = fneg <2 x double> %a
+ %fma = call <2 x double> @llvm.fmuladd(<2 x double> %fneg, <2 x double> %b, <2 x double> %c)
+ ret <2 x double> %fma
+}
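The nmadd tests cover the negated-multiplicand form of the same pattern; as in the test bodies above, the backend matches an fneg feeding the fused multiply-add:

  %neg = fneg <4 x float> %a
  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %neg, <4 x float> %b, <4 x float> %c)
  ; => f32x4.relaxed_nmadd (or f16x8.nmadd / f64x2.relaxed_nmadd)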
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 0de308a..5152c005 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -728,45 +728,70 @@ define void @avg_v32i8_2(ptr %a, ptr %b) nounwind {
define void @avg_v64i8_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v64i8_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps (%rsi), %xmm0
-; SSE2-NEXT: movaps 16(%rsi), %xmm1
-; SSE2-NEXT: movaps 32(%rsi), %xmm2
-; SSE2-NEXT: movaps 48(%rsi), %xmm3
-; SSE2-NEXT: movups %xmm3, (%rax)
-; SSE2-NEXT: movups %xmm2, (%rax)
-; SSE2-NEXT: movups %xmm1, (%rax)
-; SSE2-NEXT: movups %xmm0, (%rax)
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: movdqa 32(%rdi), %xmm2
+; SSE2-NEXT: movdqa 48(%rdi), %xmm3
+; SSE2-NEXT: pavgb (%rsi), %xmm0
+; SSE2-NEXT: pavgb 16(%rsi), %xmm1
+; SSE2-NEXT: pavgb 32(%rsi), %xmm2
+; SSE2-NEXT: pavgb 48(%rsi), %xmm3
+; SSE2-NEXT: movdqu %xmm3, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_2:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps (%rsi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rsi), %ymm1
-; AVX1-NEXT: vmovups %ymm1, (%rax)
-; AVX1-NEXT: vmovups %ymm0, (%rax)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vpavgb 48(%rsi), %xmm3, %xmm3
+; AVX1-NEXT: vmovdqu %xmm3, (%rax)
+; AVX1-NEXT: vmovdqu %xmm2, (%rax)
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8_2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps (%rsi), %ymm0
-; AVX2-NEXT: vmovaps 32(%rsi), %ymm1
-; AVX2-NEXT: vmovups %ymm1, (%rax)
-; AVX2-NEXT: vmovups %ymm0, (%rax)
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: avg_v64i8_2:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps (%rsi), %zmm0
-; AVX512-NEXT: vmovups %zmm0, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: avg_v64i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: avg_v64i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%1 = load <64 x i8>, ptr %a
%2 = load <64 x i8>, ptr %b
%3 = zext <64 x i8> %1 to <64 x i32>
%4 = zext <64 x i8> %2 to <64 x i32>
- %5 = add nuw nsw <64 x i32> %4, %4
+ %5 = add nuw nsw <64 x i32> %3, %4
%6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <64 x i32> %7 to <64 x i8>
@@ -774,7 +799,6 @@ define void @avg_v64i8_2(ptr %a, ptr %b) nounwind {
ret void
}
-
define void @avg_v4i16_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v4i16_2:
; SSE2: # %bb.0:
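The functional fix in avg_v64i8_2 is the `%5 = add nuw nsw <64 x i32> %3, %4` line: the old body added %4 to itself, so the result depended only on the second load, and the old CHECK lines were plain copies. The per-element idiom the X86 backend recognizes as pavgb is:

  %za   = zext i8 %a to i32
  %zb   = zext i8 %b to i32
  %sum  = add nuw nsw i32 %za, %zb
  %sum1 = add nuw nsw i32 %sum, 1
  %shr  = lshr i32 %sum1, 1        ; ceil((a + b) / 2)
  %avg  = trunc i32 %shr to i8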
diff --git a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
index a0c243b..f3950b7 100644
--- a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
+++ b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
@@ -1,16 +1,15 @@
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-;; A minimal test case. llc will crash if global variables already has a section
-;; prefix. Subsequent PRs will expand on this test case to test the hotness
-;; reconciliation implementation.
-
-; RUN: not llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \
+;; A minimal test case. Subsequent PRs will expand on this test case
+;; (e.g., with more functions, variables and profiles) and test the hotness
+;; reconciliation implementation.
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \
; RUN: -partition-static-data-sections=true \
; RUN: -data-sections=true -unique-section-names=false \
-; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=ERR
+; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=IR
-; ERR: Global variable hot_bss already has a section prefix hot
+; IR: .section .bss.hot.,"aw"
@hot_bss = internal global i32 0, !section_prefix !17
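The hunk ends at the first global; `!17` is defined further down in the test, outside this diff. Inferred from the prefix "hot" named in the old ERR check, the metadata would have the shape below (a sketch, not the literal test content):

  !17 = !{!"section_prefix", !"hot"}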
diff --git a/llvm/test/CodeGen/X86/global-variable-partition.ll b/llvm/test/CodeGen/X86/global-variable-partition.ll
index ce06d17..604b4fd 100644
--- a/llvm/test/CodeGen/X86/global-variable-partition.ll
+++ b/llvm/test/CodeGen/X86/global-variable-partition.ll
@@ -106,23 +106,31 @@ target triple = "x86_64-unknown-linux-gnu"
; UNIQ-NEXT: .section .data.unlikely.,"aw",@progbits,unique,8
; AGG-NEXT: .section .data.unlikely.,"aw",@progbits
+;; The `.section` directive is omitted for .data with -unique-section-names=false.
+;; See MCSectionELF::shouldOmitSectionDirective for the implementation details.
+
; For @data_with_unknown_hotness
; SYM: .type .Ldata_with_unknown_hotness,@object # @data_with_unknown_hotness
; SYM: .section .data..Ldata_with_unknown_hotness,"aw",@progbits
; UNIQ: .section .data,"aw",@progbits,unique,9
-; The `.section` directive is omitted for .data with -unique-section-names=false.
-; See MCSectionELF::shouldOmitSectionDirective for the implementation details.
+
; AGG: .data
; COMMON: .Ldata_with_unknown_hotness:
-; For @hot_data_custom_bar_section
-; It has an explicit section attribute 'var' and shouldn't have hot or unlikely suffix.
+; For variables that are not eligible for section prefix annotation
; COMMON: .type hot_data_custom_bar_section,@object
; SYM-NEXT: .section bar,"aw",@progbits
; SYM: hot_data_custom_bar_section
; UNIQ: .section bar,"aw",@progbits
; AGG: .section bar,"aw",@progbits
+; SYM: .section .data.llvm.fake_var,"aw"
+; UNIQ: .section .data,"aw"
+; AGG: .data
+
+;; No section is emitted for the external declaration @qux.
+; COMMON-NOT: qux
+
@.str = private unnamed_addr constant [5 x i8] c"hot\09\00", align 1
@.str.1 = private unnamed_addr constant [10 x i8] c"%d\09%d\09%d\0A\00", align 1
@hot_relro_array = internal constant [2 x ptr] [ptr @bss2, ptr @data3]
@@ -137,6 +145,8 @@ target triple = "x86_64-unknown-linux-gnu"
@data3 = internal global i32 3
@data_with_unknown_hotness = private global i32 5
@hot_data_custom_bar_section = internal global i32 101 #0
+@llvm.fake_var = internal global i32 123
+@qux = external global i64
define void @cold_func(i32 %0) !prof !15 {
%2 = load i32, ptr @cold_bss
diff --git a/llvm/test/CodeGen/X86/relptr-rodata.ll b/llvm/test/CodeGen/X86/relptr-rodata.ll
index ea22b08..954ea8f 100644
--- a/llvm/test/CodeGen/X86/relptr-rodata.ll
+++ b/llvm/test/CodeGen/X86/relptr-rodata.ll
@@ -10,16 +10,31 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: .long hidden-rodata
@rodata = hidden constant i32 trunc (i64 sub (i64 ptrtoint (ptr @hidden to i64), i64 ptrtoint (ptr @rodata to i64)) to i32)
+; CHECK: .section .rodata.rodata_ptrtoaddr
+; CHECK: rodata_ptrtoaddr:
+; CHECK: .long hidden-rodata_ptrtoaddr
+@rodata_ptrtoaddr = hidden constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @hidden to i64), i64 ptrtoaddr (ptr @rodata_ptrtoaddr to i64)) to i32)
+
; CHECK: .section .data.rel.ro.relro1
; CHECK: relro1:
; CHECK: .long default-relro1
@relro1 = hidden constant i32 trunc (i64 sub (i64 ptrtoint (ptr @default to i64), i64 ptrtoint (ptr @relro1 to i64)) to i32)
+; CHECK: .section .data.rel.ro.relro1_ptrtoaddr
+; CHECK: relro1_ptrtoaddr:
+; CHECK: .long default-relro1_ptrtoaddr
+@relro1_ptrtoaddr = hidden constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @default to i64), i64 ptrtoaddr (ptr @relro1_ptrtoaddr to i64)) to i32)
+
; CHECK: .section .data.rel.ro.relro2
; CHECK: relro2:
; CHECK: .long hidden-relro2
@relro2 = constant i32 trunc (i64 sub (i64 ptrtoint (ptr @hidden to i64), i64 ptrtoint (ptr @relro2 to i64)) to i32)
+; CHECK: .section .data.rel.ro.relro2_ptrtoaddr
+; CHECK: relro2_ptrtoaddr:
+; CHECK: .long hidden-relro2_ptrtoaddr
+@relro2_ptrtoaddr = constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @hidden to i64), i64 ptrtoaddr (ptr @relro2_ptrtoaddr to i64)) to i32)
+
; CHECK: .section .rodata.obj
; CHECK-NEXT: .globl obj
; CHECK: obj:
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index 5aa266d..69abf6e 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -1447,3 +1447,158 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) {
%r = icmp eq i512 %a, %b
ret i1 %r
}
+
+; Tests for any/allbits from memory.
+
+define i1 @anybits_i128_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i128_load_arg:
+; ANY: # %bb.0:
+; ANY-NEXT: movq (%rdi), %rax
+; ANY-NEXT: orq 8(%rdi), %rax
+; ANY-NEXT: setne %al
+; ANY-NEXT: retq
+ %ld = load i128, ptr %w
+ %cmp = icmp ne i128 %ld, 0
+ ret i1 %cmp
+}
+
+define i1 @allbits_i128_load_arg(ptr %w) {
+; SSE2-LABEL: allbits_i128_load_arg:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqb (%rdi), %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: allbits_i128_load_arg:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: ptest %xmm1, %xmm0
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: retq
+;
+; AVXANY-LABEL: allbits_i128_load_arg:
+; AVXANY: # %bb.0:
+; AVXANY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXANY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVXANY-NEXT: vptest %xmm1, %xmm0
+; AVXANY-NEXT: setb %al
+; AVXANY-NEXT: retq
+ %ld = load i128, ptr %w
+ %cmp = icmp eq i128 %ld, -1
+ ret i1 %cmp
+}
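+
+; A note on the ptest idiom above (per the documented PTEST semantics):
+; `ptest %xmm1, %xmm0` sets CF when (%xmm1 & ~%xmm0) == 0, so with
+; %xmm1 = all-ones, CF=1 iff every bit of %xmm0 is set, and `setb` (CF=1)
+; materializes the allbits result without a full-width compare.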
+
+define i1 @anybits_i256_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i256_load_arg:
+; ANY: # %bb.0:
+; ANY-NEXT: movq (%rdi), %rax
+; ANY-NEXT: movq 8(%rdi), %rcx
+; ANY-NEXT: orq 24(%rdi), %rcx
+; ANY-NEXT: orq 16(%rdi), %rax
+; ANY-NEXT: orq %rcx, %rax
+; ANY-NEXT: setne %al
+; ANY-NEXT: retq
+ %ld = load i256, ptr %w
+ %cmp = icmp ne i256 %ld, 0
+ ret i1 %cmp
+}
+
+define i1 @allbits_i256_load_arg(ptr %w) {
+; SSE-LABEL: allbits_i256_load_arg:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: andq 24(%rdi), %rcx
+; SSE-NEXT: andq 16(%rdi), %rax
+; SSE-NEXT: andq %rcx, %rax
+; SSE-NEXT: cmpq $-1, %rax
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: allbits_i256_load_arg:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqu (%rdi), %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vptest %ymm1, %ymm0
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: allbits_i256_load_arg:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vptest %ymm1, %ymm0
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: allbits_i256_load_arg:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT: vptest %ymm1, %ymm0
+; AVX512-NEXT: setb %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %ld = load i256, ptr %w
+ %cmp = icmp eq i256 %ld, -1
+ ret i1 %cmp
+}
+
+define i1 @anybits_i512_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i512_load_arg:
+; ANY: # %bb.0:
+; ANY-NEXT: movq 16(%rdi), %rax
+; ANY-NEXT: movq (%rdi), %rcx
+; ANY-NEXT: movq 8(%rdi), %rdx
+; ANY-NEXT: movq 24(%rdi), %rsi
+; ANY-NEXT: orq 56(%rdi), %rsi
+; ANY-NEXT: orq 40(%rdi), %rdx
+; ANY-NEXT: orq %rsi, %rdx
+; ANY-NEXT: orq 48(%rdi), %rax
+; ANY-NEXT: orq 32(%rdi), %rcx
+; ANY-NEXT: orq %rax, %rcx
+; ANY-NEXT: orq %rdx, %rcx
+; ANY-NEXT: setne %al
+; ANY-NEXT: retq
+ %ld = load i512, ptr %w
+ %cmp = icmp ne i512 %ld, 0
+ ret i1 %cmp
+}
+
+define i1 @allbits_i512_load_arg(ptr %w) {
+; NO512-LABEL: allbits_i512_load_arg:
+; NO512: # %bb.0:
+; NO512-NEXT: movq 16(%rdi), %rax
+; NO512-NEXT: movq (%rdi), %rcx
+; NO512-NEXT: movq 8(%rdi), %rdx
+; NO512-NEXT: movq 24(%rdi), %rsi
+; NO512-NEXT: andq 56(%rdi), %rsi
+; NO512-NEXT: andq 40(%rdi), %rdx
+; NO512-NEXT: andq %rsi, %rdx
+; NO512-NEXT: andq 48(%rdi), %rax
+; NO512-NEXT: andq 32(%rdi), %rcx
+; NO512-NEXT: andq %rax, %rcx
+; NO512-NEXT: andq %rdx, %rcx
+; NO512-NEXT: cmpq $-1, %rcx
+; NO512-NEXT: sete %al
+; NO512-NEXT: retq
+;
+; AVX512-LABEL: allbits_i512_load_arg:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = -1
+; AVX512-NEXT: vpcmpneqd (%rdi), %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %ld = load i512, ptr %w
+ %cmp = icmp eq i512 %ld, -1
+ ret i1 %cmp
+}
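+
+; Likewise for the AVX-512 form (illustrative): `vpternlogd` with an
+; all-ones immediate materializes -1, `vpcmpneqd` sets one mask bit per
+; dword that differs from -1, and `kortestw` sets ZF iff that mask is
+; zero, so `sete` is true exactly when all 512 bits were set.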
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index 48aec4b..57da338 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -917,11 +917,11 @@ main:
# CHECK: f16x8.nearest # encoding: [0xfd,0xb6,0x02]
f16x8.nearest
- # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xce,0x02]
- f16x8.relaxed_madd
+ # CHECK: f16x8.madd # encoding: [0xfd,0xce,0x02]
+ f16x8.madd
- # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xcf,0x02]
- f16x8.relaxed_nmadd
+ # CHECK: f16x8.nmadd # encoding: [0xfd,0xcf,0x02]
+ f16x8.nmadd
# CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc5,0x02]
i16x8.trunc_sat_f16x8_s
diff --git a/llvm/test/Other/debugcounter-dce.ll b/llvm/test/Other/debugcounter-dce.ll
index 54d929f..3b1dfb4 100644
--- a/llvm/test/Other/debugcounter-dce.ll
+++ b/llvm/test/Other/debugcounter-dce.ll
@@ -1,8 +1,16 @@
; REQUIRES: asserts
-; RUN: opt -passes=dce -S -debug-counter=dce-transform=1-2 < %s | FileCheck %s
+; RUN: opt -passes=dce -S -debug-counter=dce-transform=1-2 < %s | FileCheck %s --check-prefixes=CHECK,NO-PRINT
+; RUN: opt -passes=dce -S -debug-counter=dce-transform=1-2 -print-debug-counter-queries < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT
;; Test that, with debug counters on, we skip the first DCE opportunity, perform the next 2,
;; and ignore all the remaining ones.
+; NO-PRINT-NOT: DebugCounter
+; PRINT: DebugCounter dce-transform=0 skip
+; PRINT-NEXT: DebugCounter dce-transform=1 execute
+; PRINT-NEXT: DebugCounter dce-transform=2 execute
+; PRINT-NEXT: DebugCounter dce-transform=3 skip
+; PRINT-NEXT: DebugCounter dce-transform=4 skip
+
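+;; For reference (behavioral sketch matching the lines above): the chunk
+;; `dce-transform=1-2` executes the zero-based queries 1 and 2 and skips
+;; the rest, and -print-debug-counter-queries logs one
+;; `DebugCounter <name>=<query> <skip|execute>` line per query to stderr.
+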
; CHECK-LABEL: @test
; CHECK-NEXT: %add1 = add i32 1, 2
; CHECK-NEXT: %sub1 = sub i32 %add1, 1
diff --git a/llvm/test/TableGen/listsplat.td b/llvm/test/TableGen/listsplat.td
index 5a93a4c..43803d6 100644
--- a/llvm/test/TableGen/listsplat.td
+++ b/llvm/test/TableGen/listsplat.td
@@ -1,4 +1,5 @@
// RUN: llvm-tblgen %s | FileCheck %s
+// RUN: not llvm-tblgen -DERROR1 %s 2>&1 | FileCheck --check-prefix=ERROR1 %s
// CHECK: ------------- Classes -----------------
// CHECK-NEXT: class X<int X:a = ?, int X:b = ?> {
@@ -73,3 +74,8 @@ def DYa1 : Y<"a", 1>;
def DYa2 : Y<"a", 2>;
def DZ : X<42, !size([1, 2, 3])>;
+
+#ifdef ERROR1
+// ERROR1: !listsplat count -1 is negative
+defvar E = !listsplat("", -1);
+#endif
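+
+// For reference (illustrative, not checked here): a non-negative count is
+// fine, e.g. !listsplat("", 2) evaluates to ["", ""]; only a negative
+// count is diagnosed.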
diff --git a/llvm/test/Transforms/InstCombine/add-sitofp.ll b/llvm/test/Transforms/InstCombine/add-sitofp.ll
index fae1365..e1d39fd 100644
--- a/llvm/test/Transforms/InstCombine/add-sitofp.ll
+++ b/llvm/test/Transforms/InstCombine/add-sitofp.ll
@@ -99,12 +99,15 @@ define float @test_3(i32 %a, i32 %b) {
ret float %p
}
+; Don't perform the fold on vector operations, as the integer op may be
+; much more expensive than the float op in that case.
define <4 x double> @test_4(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: @test_4(
; CHECK-NEXT: [[A_AND:%.*]] = and <4 x i32> [[A:%.*]], splat (i32 1073741823)
; CHECK-NEXT: [[B_AND:%.*]] = and <4 x i32> [[B:%.*]], splat (i32 1073741823)
-; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i32> [[A_AND]], [[B_AND]]
-; CHECK-NEXT: [[RES:%.*]] = uitofp nneg <4 x i32> [[TMP1]] to <4 x double>
+; CHECK-NEXT: [[A_AND_FP:%.*]] = uitofp nneg <4 x i32> [[A_AND]] to <4 x double>
+; CHECK-NEXT: [[B_AND_FP:%.*]] = uitofp nneg <4 x i32> [[B_AND]] to <4 x double>
+; CHECK-NEXT: [[RES:%.*]] = fadd <4 x double> [[A_AND_FP]], [[B_AND_FP]]
; CHECK-NEXT: ret <4 x double> [[RES]]
;
; Drop the two highest bits to guarantee that %a + %b doesn't overflow
diff --git a/llvm/test/Transforms/InstCombine/binop-itofp.ll b/llvm/test/Transforms/InstCombine/binop-itofp.ll
index 702bbbb..57184ea 100644
--- a/llvm/test/Transforms/InstCombine/binop-itofp.ll
+++ b/llvm/test/Transforms/InstCombine/binop-itofp.ll
@@ -1063,6 +1063,25 @@ define float @negzero_check_on_constant_for_si_fmul(i1 %c, i1 %.b, ptr %g_2345)
ret float %mul3.i.i
}
+; Don't perform the fold on vector operations, as the integer op may be
+; much more expensive than the float op in that case.
+define <2 x half> @test_ui_ui_i8_mul_vec(<2 x i8> noundef %x_in, <2 x i8> noundef %y_in) {
+; CHECK-LABEL: @test_ui_ui_i8_mul_vec(
+; CHECK-NEXT: [[X:%.*]] = and <2 x i8> [[X_IN:%.*]], splat (i8 15)
+; CHECK-NEXT: [[Y:%.*]] = and <2 x i8> [[Y_IN:%.*]], splat (i8 15)
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg <2 x i8> [[X]] to <2 x half>
+; CHECK-NEXT: [[YF:%.*]] = uitofp nneg <2 x i8> [[Y]] to <2 x half>
+; CHECK-NEXT: [[R:%.*]] = fmul <2 x half> [[XF]], [[YF]]
+; CHECK-NEXT: ret <2 x half> [[R]]
+;
+ %x = and <2 x i8> %x_in, splat (i8 15)
+ %y = and <2 x i8> %y_in, splat (i8 15)
+ %xf = uitofp <2 x i8> %x to <2 x half>
+ %yf = uitofp <2 x i8> %y to <2 x half>
+ %r = fmul <2 x half> %xf, %yf
+ ret <2 x half> %r
+}
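+
+; For contrast, the scalar shape this fold targets (illustrative sketch):
+;   %m = mul nuw nsw i8 %x, %y     ; one integer multiply
+;   %r = uitofp nneg i8 %m to half ; plus a single conversion
+; replaces two conversions and an fmul; for vectors the integer multiply
+; may be costlier than the float ops, so the fold is skipped above.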
+
define <2 x float> @nonzero_check_on_constant_for_si_fmul_vec_w_poison(i1 %c, i1 %.b, ptr %g_2345) {
; CHECK-LABEL: @nonzero_check_on_constant_for_si_fmul_vec_w_poison(
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
@@ -1091,8 +1110,9 @@ define <2 x float> @nonzero_check_on_constant_for_si_fmul_nz_vec_w_poison(i1 %c,
; CHECK-NEXT: [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
; CHECK-NEXT: [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[MUL3_I_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT: [[MUL3_I_I1:%.*]] = fmul <2 x float> [[MUL3_I_I]], <float poison, float 1.000000e+00>
; CHECK-NEXT: store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
-; CHECK-NEXT: ret <2 x float> [[MUL3_I_I]]
+; CHECK-NEXT: ret <2 x float> [[MUL3_I_I1]]
;
%sel = select i1 %c, i32 65529, i32 53264
%conv.i.s = trunc i32 %sel to i16
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
index 7b0b152..ffaa8b1 100644
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
@@ -23,10 +23,7 @@ define i64 @ptrtoaddr_inttoptr_arg(i64 %a) {
define i32 @ptrtoaddr_inttoptr_arg_addrsize(i32 %a) {
; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg_addrsize(
; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[A]] to i64
-; CHECK-NEXT: [[TOPTR:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(1)
-; CHECK-NEXT: [[TOADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[TOPTR]] to i32
-; CHECK-NEXT: ret i32 [[TOADDR]]
+; CHECK-NEXT: ret i32 [[A]]
;
%toptr = inttoptr i32 %a to ptr addrspace(1)
%toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
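
;; Why this folds to a plain return (illustrative, assuming addrspace(1)
;; uses a 32-bit address size here, as the old zext-to-i64 checks imply):
;; `inttoptr` zero-extends %a into the wider pointer and
;; `ptrtoaddr ... to i32` reads back exactly those low 32 address bits,
;; so the round-trip is the identity on %a.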
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
index 9ed2240..9357adf 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
@@ -273,3 +273,106 @@ loop:
exit:
ret void
}
+
+define void @ld_div2_ld_scevunknown_nonuniform(ptr %src.a, ptr noalias %src.b, ptr noalias %dst) {
+; CHECK-LABEL: define void @ld_div2_ld_scevunknown_nonuniform
+; CHECK-SAME: (ptr [[SRC_A:%.*]], ptr noalias [[SRC_B:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <8 x i64> poison, i64 [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> [[TMP24]], i64 [[TMP17]], i32 1
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 2
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i64> [[TMP26]], i64 [[TMP19]], i32 3
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i64> [[TMP27]], i64 [[TMP20]], i32 4
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x i64> [[TMP28]], i64 [[TMP21]], i32 5
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i64> [[TMP29]], i64 [[TMP22]], i32 6
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i64> [[TMP30]], i64 [[TMP23]], i32 7
+; CHECK-NEXT: [[TMP32:%.*]] = udiv <8 x i64> [[TMP31]], splat (i64 2)
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i64> [[TMP32]], i32 0
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i64> [[TMP32]], i32 1
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP35]]
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i64> [[TMP32]], i32 2
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i64> [[TMP32]], i32 3
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP39]]
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i64> [[TMP32]], i32 4
+; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP41]]
+; CHECK-NEXT: [[TMP43:%.*]] = extractelement <8 x i64> [[TMP32]], i32 5
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP43]]
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <8 x i64> [[TMP32]], i32 6
+; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP45]]
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i64> [[TMP32]], i32 7
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP47]]
+; CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP34]], align 4
+; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP36]], align 4
+; CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP38]], align 4
+; CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP40]], align 4
+; CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP42]], align 4
+; CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP44]], align 4
+; CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP46]], align 4
+; CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP48]], align 4
+; CHECK-NEXT: [[TMP57:%.*]] = insertelement <8 x i32> poison, i32 [[TMP49]], i32 0
+; CHECK-NEXT: [[TMP58:%.*]] = insertelement <8 x i32> [[TMP57]], i32 [[TMP50]], i32 1
+; CHECK-NEXT: [[TMP59:%.*]] = insertelement <8 x i32> [[TMP58]], i32 [[TMP51]], i32 2
+; CHECK-NEXT: [[TMP60:%.*]] = insertelement <8 x i32> [[TMP59]], i32 [[TMP52]], i32 3
+; CHECK-NEXT: [[TMP61:%.*]] = insertelement <8 x i32> [[TMP60]], i32 [[TMP53]], i32 4
+; CHECK-NEXT: [[TMP62:%.*]] = insertelement <8 x i32> [[TMP61]], i32 [[TMP54]], i32 5
+; CHECK-NEXT: [[TMP63:%.*]] = insertelement <8 x i32> [[TMP62]], i32 [[TMP55]], i32 6
+; CHECK-NEXT: [[TMP64:%.*]] = insertelement <8 x i32> [[TMP63]], i32 [[TMP56]], i32 7
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT: store <8 x i32> [[TMP64]], ptr [[TMP65]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK: scalar.ph:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.a = getelementptr i32, ptr %src.a, i64 %iv
+ %load.a = load i64, ptr %gep.a
+ %d = udiv i64 %load.a, 2
+ %gep.b = getelementptr i32, ptr %src.b, i64 %d
+ %load.b = load i32, ptr %gep.b
+ %gep.dst = getelementptr i32, ptr %dst, i64 %iv
+ store i32 %load.b, ptr %gep.dst
+ %iv.next = add i64 %iv, 1
+ %exit.cond = icmp eq i64 %iv, 1000
+ br i1 %exit.cond, label %exit, label %loop
+
+exit:
+ ret void
+}
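+
+; The gist of the checks above (illustrative summary): the value being
+; divided is itself a load, i.e. a SCEVUnknown that is not uniform across
+; the VF=8 iteration, so the %src.b addresses cannot be widened and the
+; dependent loads are emitted as eight per-lane extract/gep/load steps.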
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
new file mode 100644
index 0000000..d281905
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
@@ -0,0 +1,539 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:128:128' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR128
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32
+
+; REQUIRES: aarch64-registered-target
+
+; See the comment in `data-layout.ll` for an explanation.
+
+target triple = "aarch64-unknown-unknown"
+
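+; What the three prefixes pin down (illustrative): both the alias-check
+; ptrtoint width and the GEP offset type follow the pointer size from the
+; given -data-layout, so the same input lowers with i128, i64, or i32
+; address arithmetic under PTR128, PTR64, and PTR32 respectively.
+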
+define void @multiply(ptr %A, ptr %B, ptr %C) {
+; PTR128-LABEL: @multiply(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i128
+; PTR128-NEXT: [[STORE_END:%.*]] = add nuw nsw i128 [[STORE_BEGIN]], 128
+; PTR128-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i128
+; PTR128-NEXT: [[TMP0:%.*]] = icmp ugt i128 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR128-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR128: alias_cont:
+; PTR128-NEXT: [[LOAD_END:%.*]] = add nuw nsw i128 [[LOAD_BEGIN]], 128
+; PTR128-NEXT: [[TMP1:%.*]] = icmp ugt i128 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR128-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR128: copy:
+; PTR128-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR128-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR128-NEXT: br label [[NO_ALIAS]]
+; PTR128: no_alias:
+; PTR128-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR128-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i128
+; PTR128-NEXT: [[STORE_END5:%.*]] = add nuw nsw i128 [[STORE_BEGIN4]], 128
+; PTR128-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i128
+; PTR128-NEXT: [[TMP4:%.*]] = icmp ugt i128 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR128-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR128: alias_cont1:
+; PTR128-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i128 [[LOAD_BEGIN6]], 128
+; PTR128-NEXT: [[TMP5:%.*]] = icmp ugt i128 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR128-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR128: copy2:
+; PTR128-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR128-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR128-NEXT: br label [[NO_ALIAS3]]
+; PTR128: no_alias3:
+; PTR128-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32
+; PTR128-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR128-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32
+; PTR128-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR128-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR128-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR128-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR128-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64
+; PTR128-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR128-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96
+; PTR128-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR128-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16
+; PTR128-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR128-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48
+; PTR128-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR128-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR128-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR128-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR128-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR128-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i128 32
+; PTR128-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR128-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16
+; PTR128-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR128-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48
+; PTR128-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR128-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR128-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32
+; PTR128-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR128-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR128-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR128-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR128-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80
+; PTR128-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR128-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112
+; PTR128-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR128-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16
+; PTR128-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR128-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48
+; PTR128-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR128-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR128-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR128-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR128-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i128 16
+; PTR128-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR128-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i128 48
+; PTR128-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR128-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR128-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32
+; PTR128-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR128-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64
+; PTR128-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR128-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96
+; PTR128-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR128-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR128-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR128-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR128-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64
+; PTR128-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR128-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96
+; PTR128-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR128-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80
+; PTR128-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR128-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112
+; PTR128-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR128-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR128-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR128-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR128-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i128 64
+; PTR128-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR128-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i128 96
+; PTR128-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR128-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16
+; PTR128-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR128-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48
+; PTR128-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR128-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64
+; PTR128-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR128-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96
+; PTR128-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR128-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR128-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR128-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR128-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80
+; PTR128-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR128-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112
+; PTR128-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR128-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80
+; PTR128-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR128-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112
+; PTR128-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR128-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR128-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR128-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR128-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i128 80
+; PTR128-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR128-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i128 112
+; PTR128-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR128-NEXT: ret void
+;
+; PTR64-LABEL: @multiply(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64
+; PTR64-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 128
+; PTR64-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; PTR64-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR64-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR64: alias_cont:
+; PTR64-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 128
+; PTR64-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR64-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR64: copy:
+; PTR64-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR64-NEXT: br label [[NO_ALIAS]]
+; PTR64: no_alias:
+; PTR64-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR64-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i64
+; PTR64-NEXT: [[STORE_END5:%.*]] = add nuw nsw i64 [[STORE_BEGIN4]], 128
+; PTR64-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i64
+; PTR64-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR64-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR64: alias_cont1:
+; PTR64-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i64 [[LOAD_BEGIN6]], 128
+; PTR64-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR64-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR64: copy2:
+; PTR64-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR64-NEXT: br label [[NO_ALIAS3]]
+; PTR64: no_alias3:
+; PTR64-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32
+; PTR64-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR64-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; PTR64-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR64-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR64-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR64-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR64-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64
+; PTR64-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR64-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96
+; PTR64-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
+; PTR64-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR64-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48
+; PTR64-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR64-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR64-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR64-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR64-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR64-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i64 32
+; PTR64-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR64-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
+; PTR64-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR64-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48
+; PTR64-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR64-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR64-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; PTR64-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR64-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR64-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR64-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR64-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80
+; PTR64-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR64-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112
+; PTR64-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR64-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
+; PTR64-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR64-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48
+; PTR64-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR64-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR64-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR64-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR64-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i64 16
+; PTR64-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR64-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i64 48
+; PTR64-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR64-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR64-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32
+; PTR64-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR64-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; PTR64-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR64-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; PTR64-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR64-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR64-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR64-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR64-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64
+; PTR64-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR64-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96
+; PTR64-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR64-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80
+; PTR64-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR64-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112
+; PTR64-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR64-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR64-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR64-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR64-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i64 64
+; PTR64-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR64-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i64 96
+; PTR64-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR64-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
+; PTR64-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR64-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48
+; PTR64-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR64-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; PTR64-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR64-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; PTR64-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR64-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR64-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR64-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR64-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80
+; PTR64-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR64-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112
+; PTR64-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR64-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80
+; PTR64-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR64-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112
+; PTR64-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR64-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR64-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR64-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR64-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i64 80
+; PTR64-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR64-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i64 112
+; PTR64-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR64-NEXT: ret void
+;
+; PTR32-LABEL: @multiply(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i32
+; PTR32-NEXT: [[STORE_END:%.*]] = add nuw nsw i32 [[STORE_BEGIN]], 128
+; PTR32-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i32
+; PTR32-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR32-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR32: alias_cont:
+; PTR32-NEXT: [[LOAD_END:%.*]] = add nuw nsw i32 [[LOAD_BEGIN]], 128
+; PTR32-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR32-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR32: copy:
+; PTR32-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR32-NEXT: br label [[NO_ALIAS]]
+; PTR32: no_alias:
+; PTR32-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR32-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i32
+; PTR32-NEXT: [[STORE_END5:%.*]] = add nuw nsw i32 [[STORE_BEGIN4]], 128
+; PTR32-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i32
+; PTR32-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR32-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR32: alias_cont1:
+; PTR32-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i32 [[LOAD_BEGIN6]], 128
+; PTR32-NEXT: [[TMP5:%.*]] = icmp ugt i32 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR32-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR32: copy2:
+; PTR32-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR32-NEXT: br label [[NO_ALIAS3]]
+; PTR32: no_alias3:
+; PTR32-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i32 32
+; PTR32-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR32-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32
+; PTR32-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR32-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR32-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR32-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR32-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i32 64
+; PTR32-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR32-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i32 96
+; PTR32-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR32-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
+; PTR32-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR32-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48
+; PTR32-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR32-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR32-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR32-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR32-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR32-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i32 32
+; PTR32-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR32-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; PTR32-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR32-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i32 48
+; PTR32-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR32-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR32-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32
+; PTR32-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR32-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR32-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR32-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR32-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i32 80
+; PTR32-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR32-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i32 112
+; PTR32-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR32-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
+; PTR32-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR32-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48
+; PTR32-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR32-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR32-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR32-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR32-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i32 16
+; PTR32-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR32-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i32 48
+; PTR32-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR32-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR32-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i32 32
+; PTR32-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR32-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i32 64
+; PTR32-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR32-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i32 96
+; PTR32-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR32-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR32-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR32-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR32-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i32 64
+; PTR32-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR32-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i32 96
+; PTR32-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR32-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i32 80
+; PTR32-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR32-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i32 112
+; PTR32-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR32-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR32-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR32-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR32-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i32 64
+; PTR32-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR32-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i32 96
+; PTR32-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR32-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; PTR32-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR32-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i32 48
+; PTR32-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR32-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i32 64
+; PTR32-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR32-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i32 96
+; PTR32-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR32-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR32-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR32-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR32-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i32 80
+; PTR32-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR32-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i32 112
+; PTR32-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR32-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i32 80
+; PTR32-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR32-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i32 112
+; PTR32-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR32-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR32-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR32-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR32-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i32 80
+; PTR32-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR32-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i32 112
+; PTR32-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR32-NEXT: ret void
+;
+entry:
+ %a = load <16 x double>, ptr %A, align 8
+ %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %a, i32 4, i32 4, i32 4)
+ store <16 x double> %c, ptr %C, align 8
+ ret void
+}
+
+declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
new file mode 100644
index 0000000..87def6b
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
@@ -0,0 +1,312 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:128:128' -S < %s | FileCheck %s --check-prefix=PTR128
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:64:64' -S < %s | FileCheck %s --check-prefix=PTR64
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:32:32' -S < %s | FileCheck %s --check-prefix=PTR32
+
+; To properly support the matrix intrinsics on, e.g., 32-bit platforms (without
+; the need to emit `libc` calls), we perform strided index calculations using
+; the same pointer bit-width as the matrix pointers, as determined by the data
+; layout. To verify this behaviour, this test runs several strided loads and
+; stores through the lowering pass with (32|64|128)-bit pointers, and verifies
+; that the generated code extends/truncates strides accordingly. Similarly,
+; `data-layout-multiply-fused.ll` adopts this approach to verify the same
+; behaviour for index calculations emitted while lowering fused matrix
+; multiplies.
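+;
+; As a condensed sketch of the checks below: with 32-bit pointers, an `i64`
+; stride is truncated before the index computation,
+;   %stride.cast = trunc i64 %stride to i32
+;   %vec.start = mul i32 1, %stride.cast
+;   %vec.gep = getelementptr double, ptr %in, i32 %vec.start
+; whereas with 128-bit pointers the same stride is zero-extended to `i128`.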
+
+define <9 x double> @strided_load_3x3_i128(ptr %in, i128 %stride) {
+; PTR128-LABEL: @strided_load_3x3_i128(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE:%.*]]
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE]]
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE]]
+; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
+; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_i128(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i64
+; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i128(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i32
+; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 %stride, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i128(ptr %in) {
+; PTR128-LABEL: @strided_load_3x3_const_stride_i128(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
+; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_const_stride_i128(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i128(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 16, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_i64(ptr %in, i64 %stride) {
+; PTR128-LABEL: @strided_load_3x3_i64(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[STRIDE_CAST:%.*]] = zext i64 [[STRIDE:%.*]] to i128
+; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
+; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_i64(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE]]
+; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i64(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[STRIDE_CAST:%.*]] = trunc i64 [[STRIDE:%.*]] to i32
+; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr %in, i64 %stride, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i64(ptr %in) {
+; PTR128-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
+; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr %in, i64 16, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_i32(ptr %in, i32 %stride) {
+; PTR128-LABEL: @strided_load_3x3_i32(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i128
+; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
+; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_i32(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i32(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]]
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE]]
+; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %in, i32 %stride, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i32(ptr %in) {
+; PTR128-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
+; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %in, i32 16, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr, i128, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr, i64, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
index ae7da19..abc4705 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
@@ -62,11 +62,12 @@ declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32,
define <8 x double> @strided_load_4x2_stride_i32(ptr %in, i32 %stride) {
; CHECK-LABEL: @strided_load_4x2_stride_i32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; CHECK-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]]
-; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x double>, ptr [[VEC_GEP2]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> [[COL_LOAD4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <8 x double> [[TMP0]]
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
index 28e9cdb..81b8507 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
@@ -34,11 +34,12 @@ define void @strided_store_3x2_nonconst_i32_stride(<6 x double> %in, i32 %stride
; CHECK-LABEL: @strided_store_3x2_nonconst_i32_stride(
; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x double> [[IN:%.*]], <6 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT:%.*]], i32 [[VEC_START]]
+; CHECK-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[VEC_START]]
; CHECK-NEXT: store <3 x double> [[SPLIT]], ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]]
-; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[OUT]], i32 [[VEC_START2]]
+; CHECK-NEXT: [[VEC_START2:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[OUT]], i64 [[VEC_START2]]
; CHECK-NEXT: store <3 x double> [[SPLIT1]], ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/PGOProfile/data-access-profile.ll b/llvm/test/Transforms/PGOProfile/data-access-profile.ll
index 29198f34..205184b 100644
--- a/llvm/test/Transforms/PGOProfile/data-access-profile.ll
+++ b/llvm/test/Transforms/PGOProfile/data-access-profile.ll
@@ -3,55 +3,72 @@
; RUN: rm -rf %t && split-file %s %t && cd %t
-;; Read a text profile and merge it into indexed profile.
+;; Read text profiles and merge them into indexed profiles.
; RUN: llvm-profdata merge --memprof-version=4 memprof.yaml -o memprof.profdata
+; RUN: llvm-profdata merge --memprof-version=4 memprof-no-dap.yaml -o memprof-no-dap.profdata
;; Run the optimizer pass on an IR module without IR functions, and test that
;; global variables in the module can be annotated (i.e., no early return).
; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \
-; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT
+; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT
;; Run optimizer pass on the IR, and check the section prefix.
; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \
-; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT
+; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT
-;; Run optimizer pass without explicitly setting -memprof-annotate-static-data-prefix.
-;; The output text IR shouldn't have `section_prefix`
+;; Run memprof with a profile that lacks data access profiles. Test that the IR
+;; has the module flag `EnableDataAccessProf` set to 0.
+; RUN: opt -passes='memprof-use<profile-filename=memprof-no-dap.profdata>' -memprof-annotate-static-data-prefix \
+; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefix=FLAG
+
+;; Run memprof without explicitly setting -memprof-annotate-static-data-prefix.
+;; The output text IR shouldn't have `section_prefix` or the `EnableDataAccessProf` module flag.
; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' \
-; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --implicit-check-not="section_prefix"
+; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --check-prefix=FLAGLESS --implicit-check-not="section_prefix"
; LOG: Skip annotating string literal .str
; LOG: Global variable var1 is annotated as hot
; LOG: Global variable var2.llvm.125 is annotated as hot
; LOG: Global variable bar is not annotated
; LOG: Global variable foo is annotated as unlikely
-; LOG: Global variable var3 has explicit section name. Skip annotating.
-; LOG: Global variable var4 has explicit section name. Skip annotating.
+; LOG: Skip annotation for var3 due to explicit section name.
+; LOG: Skip annotation for var4 due to explicit section name.
+; LOG: Skip annotation for llvm.fake_var due to name starts with `llvm.`.
+; LOG: Skip annotation for qux due to linker declaration.
;; String literals are not annotated.
-; PREFIX: @.str = unnamed_addr constant [5 x i8] c"abcde"
-; PREFIX-NOT: section_prefix
-; PREFIX: @var1 = global i32 123, !section_prefix !0
+; IR: @.str = unnamed_addr constant [5 x i8] c"abcde"
+; IR-NOT: section_prefix
+; IR: @var1 = global i32 123, !section_prefix !0
;; @var2.llvm.125 will be canonicalized to @var2 for profile look-up.
-; PREFIX-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0
+; IR-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0
;; @bar is not seen in the hot symbol or known symbol set, so it won't get a
;; section prefix. Verify this by checking that there is no section_prefix
;; between @bar and @foo.
-; PREFIX-NEXT: @bar = global i16 3
-; PREFIX-NOT: !section_prefix
+; IR-NEXT: @bar = global i16 3
+; IR-NOT: !section_prefix
;; @foo is unlikely.
-; PREFIX-NEXT: @foo = global i8 2, !section_prefix !1
+; IR-NEXT: @foo = global i8 2, !section_prefix !1
+
+; IR-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1"
+; IR-NEXT: @var4 = constant [1 x i64] [i64 98765] #0
+
+; IR: @llvm.fake_var = global i32 123
+; IR-NOT: !section_prefix
+; IR: @qux = external global i64
+; IR-NOT: !section_prefix
-; PREFIX-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1"
-; PREFIX-NEXT: @var4 = constant [1 x i64] [i64 98765] #0
+; IR: attributes #0 = { "rodata-section"="sec2" }
-; PREFIX: attributes #0 = { "rodata-section"="sec2" }
+; IR: !0 = !{!"section_prefix", !"hot"}
+; IR-NEXT: !1 = !{!"section_prefix", !"unlikely"}
+; IR-NEXT: !2 = !{i32 2, !"EnableDataAccessProf", i32 1}
-; PREFIX: !0 = !{!"section_prefix", !"hot"}
-; PREFIX-NEXT: !1 = !{!"section_prefix", !"unlikely"}
+; FLAG: !{i32 2, !"EnableDataAccessProf", i32 0}
+; FLAGLESS-NOT: EnableDataAccessProf
; STAT: 1 memprof - Number of global vars annotated with 'unlikely' section prefix.
; STAT: 2 memprof - Number of global vars with user-specified section (not annotated).
@@ -72,6 +89,24 @@ DataAccessProfiles:
- foo
KnownColdStrHashes: [ 999, 1001 ]
...
+;--- memprof-no-dap.yaml
+---
+# A memprof file without data access profiles. The heap records are simplified
+# to pass profile parsing and don't need to match the IR.
+HeapProfileRecords:
+ - GUID: 0xdeadbeef12345678
+ AllocSites:
+ - Callstack:
+ - { Function: 0x1111111111111111, LineOffset: 11, Column: 10, IsInlineFrame: true }
+ MemInfoBlock:
+ AllocCount: 111
+ TotalSize: 222
+ TotalLifetime: 333
+ TotalLifetimeAccessDensity: 444
+ CallSites:
+ - Frames:
+ - { Function: 0x5555555555555555, LineOffset: 55, Column: 50, IsInlineFrame: true }
+...
;--- input.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
@@ -84,11 +119,14 @@ target triple = "x86_64-unknown-linux-gnu"
@foo = global i8 2
@var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1"
@var4 = constant [1 x i64][i64 98765] #0
+@llvm.fake_var = global i32 123
+@qux = external global i64
define i32 @func() {
%a = load i32, ptr @var1
%b = load i32, ptr @var2.llvm.125
- %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b)
+ %c = load i32, ptr @llvm.fake_var
+ %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b, i32 %c)
ret i32 %ret
}
@@ -108,5 +146,8 @@ target triple = "x86_64-unknown-linux-gnu"
@foo = global i8 2
@var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1"
@var4 = constant [1 x i64][i64 98765] #0
+@llvm.fake_var = global i32 123
+@qux = external global i64
+
attributes #0 = { "rodata-section"="sec2" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
index c5f72f2..fded7a4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
@@ -4,21 +4,9 @@
define i32 @crash_reordering_undefs() {
; CHECK-LABEL: @crash_reordering_undefs(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef
-; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]]
-; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537
-; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537
-; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]]
-; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537
+; CHECK-NEXT: [[ADD0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> splat (i32 65537))
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[ADD0]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[ADD2]], [[ADD4]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[ADD9]]
-; CHECK-NEXT: ret i32 [[OP_RDX3]]
+; CHECK-NEXT: ret i32 [[OP_RDX]]
;
entry:
%or0 = or i64 undef, undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
index 3ac0d01..13b050d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
@@ -6,15 +6,15 @@ define i1 @test(i32 %g, i16 %d) {
; CHECK-SAME: i32 [[G:%.*]], i16 [[D:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = and i16 [[D]], 1
-; CHECK-NEXT: [[XOR_I_I:%.*]] = xor i32 [[G]], 1
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[G]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[XOR_I_I]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = xor <2 x i32> [[TMP2]], <i32 0, i32 1>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i8>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[TMP5]], <i8 -9, i8 -9, i8 -1, i8 -1>
; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i8> [[TMP6]], splat (i8 -3)
; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP7]] to <4 x i8>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[TMP10]], [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP12]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
index f07424f..43302f2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
@@ -3,32 +3,7 @@
define i32 @test() {
; CHECK-LABEL: define i32 @test() {
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 0, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 poison>
-; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], <i32 0, i32 0, i32 0, i32 poison>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <24 x i32> <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 3, i32 3, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 7, i32 7, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <64 x i32> [[TMP9]], <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 poison, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 poison, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <64 x i32> [[TMP10]], <64 x i32> [[TMP12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 64, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 poison, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <64 x i32> [[TMP13]], <64 x i32> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 67, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <24 x i32> [[TMP6]], <24 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <64 x i32> [[TMP16]], <64 x i32> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <64 x i32> [[TMP27]], <64 x i32> [[TMP28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <64 x i32> zeroinitializer, [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i32> zeroinitializer, [[TMP18]]
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <64 x i1> [[TMP19]], <64 x i1> [[TMP20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP22:%.*]] = zext <64 x i1> [[TMP21]] to <64 x i8>
-; CHECK-NEXT: [[TMP23:%.*]] = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> [[TMP22]])
+; CHECK-NEXT: [[TMP23:%.*]] = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
; CHECK-NEXT: [[TMP24:%.*]] = sext i8 [[TMP23]] to i32
; CHECK-NEXT: ret i32 [[TMP24]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
index 1fedde4..3e9bd78 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
@@ -3,12 +3,8 @@
define void @test() {
; CHECK-LABEL: define void @test() {
-; CHECK-NEXT: [[XOR108_I_I_I:%.*]] = xor i64 0, 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0>, i64 [[XOR108_I_I_I]], i32 10
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <12 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i64> poison, i64 [[XOR108_I_I_I]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <12 x i64> [[TMP2]], <12 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP5]], <16 x i64> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i64> poison, i64 1, i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 1, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i64> [[TMP7]] to <16 x i1>
; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP8]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
index 034fe82..c5442b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
@@ -6,11 +6,10 @@
define void @foo() {
; CHECK-LABEL: define void @foo() {
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
@@ -24,11 +23,10 @@ define void @foo() {
;
; FORCED-LABEL: define void @foo() {
; FORCED-NEXT: bb:
-; FORCED-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
; FORCED-NEXT: br label [[BB1:%.*]]
; FORCED: bb1:
; FORCED-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; FORCED-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
+; FORCED-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], zeroinitializer
; FORCED-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 3>
; FORCED-NEXT: [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
; FORCED-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
index 2612a21..e8078ad 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
@@ -5,23 +5,22 @@ define i32 @test(i1 %cond) {
; CHECK-LABEL: define i32 @test(
; CHECK-SAME: i1 [[COND:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[OR92:%.*]] = or i32 1, 0
; CHECK-NEXT: br label %[[BB:.*]]
; CHECK: [[BB]]:
-; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OR92:%.*]], %[[BB]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OR92]], %[[BB]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], %[[BB]] ], [ zeroinitializer, %[[ENTRY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>, <4 x i32> <i32 poison, i32 1, i32 6, i32 7>
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[P1]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
-; CHECK-NEXT: [[OR92]] = or i32 1, 0
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[OR92]], i32 0
-; CHECK-NEXT: [[TMP8]] = xor <2 x i32> [[TMP9]], [[TMP7]]
-; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP6]], [[OR92]]
+; CHECK-NEXT: [[TMP8]] = xor <2 x i32> [[TMP9]], <i32 1, i32 0>
; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[BB]]
; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[OP_RDX:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
; CHECK-NEXT: ret i32 [[OP_RDX]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
index 4a5dd2a..b9f8390 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
@@ -8,42 +8,21 @@ define i16 @test() {
; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 0
; CHECK-NEXT: [[CALL99_I:%.*]] = call i32 @llvm.bswap.i32(i32 0)
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[CALL99_I]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = shl i32 0, 0
-; CHECK-NEXT: [[UNSCLEAR186_I:%.*]] = and i32 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = shl i32 0, 0
; CHECK-NEXT: [[CALL7_I45:%.*]] = tail call i32 null(i32 0)
; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[CALL7_I45]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 0, 0
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <2 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <24 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>, <24 x i32> [[TMP16]], <24 x i32> <i32 0, i32 1, i32 24, i32 25, i32 poison, i32 5, i32 poison, i32 7, i32 poison, i32 poison, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <24 x i32> [[TMP17]], <24 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 24, i32 5, i32 26, i32 7, i32 28, i32 29, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <24 x i32> [[TMP18]], i32 [[UNSCLEAR186_I]], i32 10
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <24 x i32> [[TMP19]], <24 x i32> [[TMP20]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <24 x i32> [[TMP21]], <24 x i32> [[TMP22]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 24, i32 15, i32 25, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <24 x i32> [[TMP23]], <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 24, i32 25, i32 26, i32 27, i32 22, i32 23>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <24 x i32> [[TMP24]], <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <24 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 4, i32 30, i32 6, i32 32, i32 33, i32 34, i32 poison, i32 36, i32 37, i32 38, i32 poison, i32 40, i32 poison, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <24 x i32> [[TMP25]], i32 [[UNSCLEAR186_I]], i32 11
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <24 x i32> [[TMP26]], <24 x i32> [[TMP27]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 24, i32 16, i32 26, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <24 x i32> [[TMP24]], [[TMP28]]
-; CHECK-NEXT: [[RDX_OP:%.*]] = shufflevector <24 x i1> [[TMP29]], <24 x i1> <i1 false, i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <28 x i1> [[RDX_OP]] to i28
-; CHECK-NEXT: [[TMP31:%.*]] = call i28 @llvm.ctpop.i28(i28 [[TMP30]])
-; CHECK-NEXT: [[TMP32:%.*]] = trunc i28 [[TMP31]] to i16
-; CHECK-NEXT: [[TMP33:%.*]] = call i4 @llvm.ctpop.i4(i4 -8)
-; CHECK-NEXT: [[TMP34:%.*]] = zext i4 [[TMP33]] to i16
-; CHECK-NEXT: [[OP_RDX4:%.*]] = add i16 [[TMP34]], [[TMP32]]
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <28 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 poison>, i32 [[TMP1]], i32 4
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <28 x i32> [[TMP4]], i32 [[TMP2]], i32 5
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <28 x i32> [[TMP5]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 28, i32 29, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <28 x i32> [[TMP6]], i32 [[TMP8]], i32 12
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <28 x i32> [[TMP7]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <28 x i32> [[TMP16]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 28, i32 29, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <28 x i32> [[TMP9]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 28, i32 29, i32 30, i32 31, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT: [[TMP11:%.*]] = and <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison>, [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <28 x i32> [[TMP11]], <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>, [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x i1> [[TMP13]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP14]])
+; CHECK-NEXT: [[OP_RDX4:%.*]] = trunc i32 [[TMP15]] to i16
; CHECK-NEXT: ret i16 [[OP_RDX4]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
index a7f8629..78708a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
@@ -6,20 +6,12 @@ define void @test() {
; CHECK-LABEL: define void @test() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr null, align 2
-; CHECK-NEXT: [[TMP1:%.*]] = and i8 0, 1
; CHECK-NEXT: [[TMP2:%.*]] = and i32 0, 0
; CHECK-NEXT: [[TMP3:%.*]] = select i1 false, i32 0, i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> <i8 0, i8 poison, i8 poison, i8 poison>, i8 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> zeroinitializer, zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP15]] to <4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 0, i32 0>, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i1> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 0, [[TMP14]]
; CHECK-NEXT: store i32 [[OP_RDX]], ptr null, align 4
diff --git a/llvm/test/Verifier/llvm.used-invalid-init.ll b/llvm/test/Verifier/llvm.used-invalid-init.ll
index 15a961c..38c84b15 100644
--- a/llvm/test/Verifier/llvm.used-invalid-init.ll
+++ b/llvm/test/Verifier/llvm.used-invalid-init.ll
@@ -2,5 +2,5 @@
@llvm.used = appending global [1 x ptr] zeroinitializer, section "llvm.metadata"
-; CHECK: wrong initalizer for intrinsic global variable
+; CHECK: wrong initializer for intrinsic global variable
; CHECK-NEXT: [1 x ptr] zeroinitializer
diff --git a/llvm/tools/bugpoint/BugDriver.cpp b/llvm/tools/bugpoint/BugDriver.cpp
index 2bdfebe..a7e93f6 100644
--- a/llvm/tools/bugpoint/BugDriver.cpp
+++ b/llvm/tools/bugpoint/BugDriver.cpp
@@ -27,9 +27,7 @@
#include <memory>
using namespace llvm;
-namespace llvm {
-Triple TargetTriple;
-}
+Triple llvm::TargetTriple;
DiscardTemp::~DiscardTemp() {
if (SaveTemps) {
@@ -41,18 +39,14 @@ DiscardTemp::~DiscardTemp() {
errs() << "Failed to delete temp file " << toString(std::move(E)) << '\n';
}
-// Anonymous namespace to define command line options for debugging.
-//
-namespace {
// Output - The user can specify a file containing the expected output of the
// program. If this filename is set, it is used as the reference diff source,
// otherwise the raw input run through an interpreter is used as the reference
// source.
//
-cl::opt<std::string> OutputFile("output",
- cl::desc("Specify a reference program output "
- "(for miscompilation detection)"));
-}
+static cl::opt<std::string>
+ OutputFile("output", cl::desc("Specify a reference program output "
+ "(for miscompilation detection)"));
/// If we reduce or update the program somehow, call this method to update
/// bugdriver with it. This deletes the old module and sets the specified one
@@ -238,7 +232,7 @@ Error BugDriver::run() {
return Error::success();
}
-void llvm::PrintFunctionList(const std::vector<Function *> &Funcs) {
+void llvm::printFunctionList(const std::vector<Function *> &Funcs) {
unsigned NumPrint = Funcs.size();
if (NumPrint > 10)
NumPrint = 10;
@@ -249,7 +243,7 @@ void llvm::PrintFunctionList(const std::vector<Function *> &Funcs) {
outs().flush();
}
-void llvm::PrintGlobalVariableList(const std::vector<GlobalVariable *> &GVs) {
+void llvm::printGlobalVariableList(const std::vector<GlobalVariable *> &GVs) {
unsigned NumPrint = GVs.size();
if (NumPrint > 10)
NumPrint = 10;
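
The BugDriver.cpp hunks above make two mechanical cleanups: the global TargetTriple is defined with a qualified name instead of reopening namespace llvm, and the file-local cl::opt gains internal linkage via static. A minimal sketch of the qualified-definition idiom, with illustrative names not taken from bugpoint:

namespace demo {
extern int Counter; // declared here (normally in a header)
} // namespace demo

// A qualified definition such as this one only compiles if a matching
// declaration already exists in 'demo', so a typo in the name or type is a
// hard error instead of silently introducing a brand-new global:
int demo::Counter = 0;

Reopening the namespace around the definition also works, but it offers no such typo check, which is presumably why the patch prefers the qualified form.
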
diff --git a/llvm/tools/bugpoint/BugDriver.h b/llvm/tools/bugpoint/BugDriver.h
index e3117ec..ca57405 100644
--- a/llvm/tools/bugpoint/BugDriver.h
+++ b/llvm/tools/bugpoint/BugDriver.h
@@ -57,7 +57,6 @@ class BugDriver {
// FIXME: sort out public/private distinctions...
friend class ReducePassList;
- friend class ReduceMisCodegenFunctions;
public:
BugDriver(const char *toolname, bool find_bugs, unsigned timeout,
@@ -76,7 +75,7 @@ public:
void setPassesToRun(const std::vector<std::string> &PTR) {
PassesToRun = PTR;
}
- const std::vector<std::string> &getPassesToRun() const { return PassesToRun; }
+ ArrayRef<std::string> getPassesToRun() const { return PassesToRun; }
/// run - The top level method that is invoked after all of the instance
/// variables are set up from command line arguments. The \p as_child argument
@@ -111,7 +110,6 @@ public:
Error debugCodeGenerator();
/// isExecutingJIT - Returns true if bugpoint is currently testing the JIT
- ///
bool isExecutingJIT();
Module &getProgram() const { return *Program; }
@@ -167,7 +165,7 @@ public:
bool RemoveBitcode = false) const;
/// This function is used to output M to a file named "bugpoint-ID.bc".
- void EmitProgressBitcode(const Module &M, const std::string &ID,
+ void emitProgressBitcode(const Module &M, const std::string &ID,
bool NoFlyer = false) const;
/// This method clones the current Program and deletes the specified
@@ -214,7 +212,6 @@ public:
/// outs() a single line message indicating whether compilation was successful
/// or failed, unless Quiet is set. ExtraArgs specifies additional arguments
/// to pass to the child bugpoint instance.
- ///
bool runPasses(Module &Program, const std::vector<std::string> &PassesToRun,
std::string &OutputFilename, bool DeleteOutput = false,
bool Quiet = false,
@@ -223,7 +220,6 @@ public:
/// runPasses - Just like the method above, but this just returns true or
/// false indicating whether or not the optimizer crashed on the specified
/// input (true = crashed). Does not produce any output.
- ///
bool runPasses(Module &M, const std::vector<std::string> &PassesToRun) const {
std::string Filename;
return runPasses(M, PassesToRun, Filename, true);
@@ -247,7 +243,6 @@ public:
private:
/// initializeExecutionEnvironment - This method is used to set up the
/// environment for executing LLVM programs.
- ///
Error initializeExecutionEnvironment();
};
@@ -258,37 +253,31 @@ struct DiscardTemp {
/// Given a bitcode or assembly input filename, parse and return it, or return
/// null if not possible.
-///
std::unique_ptr<Module> parseInputFile(StringRef InputFilename,
LLVMContext &ctxt);
/// getPassesString - Turn a list of passes into a string which indicates the
/// command line options that must be passed to add the passes.
-///
std::string getPassesString(const std::vector<std::string> &Passes);
-/// PrintFunctionList - prints out list of problematic functions
-///
-void PrintFunctionList(const std::vector<Function *> &Funcs);
+/// Prints out the list of problematic functions.
+void printFunctionList(const std::vector<Function *> &Funcs);
-/// PrintGlobalVariableList - prints out list of problematic global variables
-///
-void PrintGlobalVariableList(const std::vector<GlobalVariable *> &GVs);
+/// Prints out the list of problematic global variables.
+void printGlobalVariableList(const std::vector<GlobalVariable *> &GVs);
-// DeleteGlobalInitializer - "Remove" the global variable by deleting its
-// initializer, making it external.
-//
-void DeleteGlobalInitializer(GlobalVariable *GV);
+/// "Remove" the global variable by deleting its initializer, making it
+/// external.
+void deleteGlobalInitializer(GlobalVariable *GV);
-// DeleteFunctionBody - "Remove" the function by deleting all of it's basic
-// blocks, making it external.
-//
-void DeleteFunctionBody(Function *F);
+/// "Remove" the function by deleting all of it's basic blocks, making it
+/// external.
+void deleteFunctionBody(Function *F);
/// Given a module and a list of functions in the module, split the functions
/// OUT of the specified module, and place them in the new module.
std::unique_ptr<Module>
-SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
+splitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
ValueToValueMapTy &VMap);
} // End llvm namespace
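
Among the header cleanups, getPassesToRun() now returns ArrayRef<std::string> rather than a const reference to the underlying vector. A short sketch of that accessor pattern, using made-up names rather than the real BugDriver members:

#include "llvm/ADT/ArrayRef.h"
#include <string>
#include <vector>

class PassHolder {
  std::vector<std::string> Passes;

public:
  // ArrayRef is a cheap, non-owning view (pointer + length). Callers can
  // iterate and index it read-only, and the backing container can later be
  // swapped (say, for SmallVector) without changing any call site.
  llvm::ArrayRef<std::string> getPasses() const { return Passes; }
};

One caveat with this pattern: the returned view is only valid while the owning object is alive and the container has not been reallocated.
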
diff --git a/llvm/tools/bugpoint/CrashDebugger.cpp b/llvm/tools/bugpoint/CrashDebugger.cpp
index fcac014..240300b 100644
--- a/llvm/tools/bugpoint/CrashDebugger.cpp
+++ b/llvm/tools/bugpoint/CrashDebugger.cpp
@@ -36,39 +36,44 @@
#include <set>
using namespace llvm;
-namespace {
-cl::opt<bool> KeepMain("keep-main",
- cl::desc("Force function reduction to keep main"),
- cl::init(false));
-cl::opt<bool> NoGlobalRM("disable-global-remove",
- cl::desc("Do not remove global variables"),
- cl::init(false));
-
-cl::opt<bool> NoAttributeRM("disable-attribute-remove",
- cl::desc("Do not remove function attributes"),
- cl::init(false));
-
-cl::opt<bool> ReplaceFuncsWithNull(
+static cl::opt<bool> KeepMain("keep-main",
+ cl::desc("Force function reduction to keep main"),
+ cl::init(false));
+static cl::opt<bool> NoGlobalRM("disable-global-remove",
+ cl::desc("Do not remove global variables"),
+ cl::init(false));
+
+static cl::opt<bool>
+ NoAttributeRM("disable-attribute-remove",
+ cl::desc("Do not remove function attributes"),
+ cl::init(false));
+
+static cl::opt<bool> ReplaceFuncsWithNull(
"replace-funcs-with-null",
cl::desc("When stubbing functions, replace all uses will null"),
cl::init(false));
-cl::opt<bool> DontReducePassList("disable-pass-list-reduction",
- cl::desc("Skip pass list reduction steps"),
- cl::init(false));
-
-cl::opt<bool> NoNamedMDRM("disable-namedmd-remove",
- cl::desc("Do not remove global named metadata"),
- cl::init(false));
-cl::opt<bool> NoStripDebugInfo("disable-strip-debuginfo",
- cl::desc("Do not strip debug info metadata"),
- cl::init(false));
-cl::opt<bool> NoStripDebugTypeInfo("disable-strip-debug-types",
- cl::desc("Do not strip debug type info metadata"),
- cl::init(false));
-cl::opt<bool> VerboseErrors("verbose-errors",
- cl::desc("Print the output of crashing program"),
- cl::init(false));
-}
+
+static cl::opt<bool>
+ DontReducePassList("disable-pass-list-reduction",
+ cl::desc("Skip pass list reduction steps"),
+ cl::init(false));
+
+static cl::opt<bool>
+ NoNamedMDRM("disable-namedmd-remove",
+ cl::desc("Do not remove global named metadata"),
+ cl::init(false));
+static cl::opt<bool>
+ NoStripDebugInfo("disable-strip-debuginfo",
+ cl::desc("Do not strip debug info metadata"),
+ cl::init(false));
+static cl::opt<bool>
+ NoStripDebugTypeInfo("disable-strip-debug-types",
+ cl::desc("Do not strip debug type info metadata"),
+ cl::init(false));
+static cl::opt<bool>
+ VerboseErrors("verbose-errors",
+ cl::desc("Print the output of crashing program"),
+ cl::init(false));
static bool isValidModule(std::unique_ptr<Module> &M,
bool ExitOnFailure = true) {
@@ -83,6 +88,8 @@ static bool isValidModule(std::unique_ptr<Module> &M,
}
namespace llvm {
+// Note this class needs to be in the llvm namespace since it is declared as a
+// friend of BugDriver.
class ReducePassList : public ListReducer<std::string> {
BugDriver &BD;
@@ -95,7 +102,7 @@ public:
Expected<TestResult> doTest(std::vector<std::string> &Removed,
std::vector<std::string> &Kept) override;
};
-}
+} // namespace llvm
Expected<ReducePassList::TestResult>
ReducePassList::doTest(std::vector<std::string> &Prefix,
@@ -156,7 +163,7 @@ public:
bool TestGlobalVariables(std::vector<GlobalVariable *> &GVs);
};
-}
+} // namespace
bool ReduceCrashingGlobalInitializers::TestGlobalVariables(
std::vector<GlobalVariable *> &GVs) {
@@ -174,14 +181,14 @@ bool ReduceCrashingGlobalInitializers::TestGlobalVariables(
}
outs() << "Checking for crash with only these global variables: ";
- PrintGlobalVariableList(GVs);
+ printGlobalVariableList(GVs);
outs() << ": ";
// Loop over and delete any global variables which we aren't supposed to be
// playing with...
for (GlobalVariable &I : M->globals())
if (I.hasInitializer() && !GVSet.count(&I)) {
- DeleteGlobalInitializer(&I);
+ deleteGlobalInitializer(&I);
I.setLinkage(GlobalValue::ExternalLinkage);
I.setComdat(nullptr);
}
@@ -223,7 +230,7 @@ public:
bool TestFuncs(std::vector<Function *> &Prefix);
};
-}
+} // namespace
static void RemoveFunctionReferences(Module *M, const char *Name) {
auto *UsedVar = M->getGlobalVariable(Name, true);
@@ -269,14 +276,14 @@ bool ReduceCrashingFunctions::TestFuncs(std::vector<Function *> &Funcs) {
}
outs() << "Checking for crash with only these functions: ";
- PrintFunctionList(Funcs);
+ printFunctionList(Funcs);
outs() << ": ";
if (!ReplaceFuncsWithNull) {
// Loop over and delete any functions which we aren't supposed to be playing
// with...
for (Function &I : *M)
if (!I.isDeclaration() && !Functions.count(&I))
- DeleteFunctionBody(&I);
+ deleteFunctionBody(&I);
} else {
std::vector<GlobalValue *> ToRemove;
// First, remove aliases to functions we're about to purge.
@@ -356,7 +363,7 @@ public:
bool TestFuncAttrs(std::vector<Attribute> &Attrs);
};
-}
+} // namespace
bool ReduceCrashingFunctionAttributes::TestFuncAttrs(
std::vector<Attribute> &Attrs) {
@@ -396,12 +403,11 @@ bool ReduceCrashingFunctionAttributes::TestFuncAttrs(
return false;
}
-namespace {
/// Simplify the CFG without completely destroying it.
/// This is not well defined, but basically comes down to "try to eliminate
/// unreachable blocks and constant fold terminators without deciding that
/// certain undefined behavior cuts off the program at the legs".
-void simpleSimplifyCfg(Function &F, SmallVectorImpl<BasicBlock *> &BBs) {
+static void simpleSimplifyCfg(Function &F, SmallVectorImpl<BasicBlock *> &BBs) {
if (F.empty())
return;
@@ -435,6 +441,8 @@ void simpleSimplifyCfg(Function &F, SmallVectorImpl<BasicBlock *> &BBs) {
for (auto *BB : Unreachable)
BB->eraseFromParent();
}
+
+namespace {
/// ReduceCrashingBlocks reducer - This works by setting the terminators of
/// all basic blocks except the specified ones to a 'ret' instruction, then
/// running the simplifycfg pass. This has the effect of chopping up
@@ -459,7 +467,7 @@ public:
bool TestBlocks(std::vector<const BasicBlock *> &Prefix);
};
-}
+} // namespace
bool ReduceCrashingBlocks::TestBlocks(std::vector<const BasicBlock *> &BBs) {
// Clone the program to try hacking it apart...
@@ -571,7 +579,7 @@ public:
bool TestBlocks(std::vector<const BasicBlock *> &Prefix);
};
-}
+} // namespace
bool ReduceCrashingConditionals::TestBlocks(
std::vector<const BasicBlock *> &BBs) {
@@ -670,7 +678,7 @@ public:
bool TestBlocks(std::vector<const BasicBlock *> &Prefix);
};
-}
+} // namespace
bool ReduceSimplifyCFG::TestBlocks(std::vector<const BasicBlock *> &BBs) {
// Clone the program to try hacking it apart...
@@ -755,7 +763,7 @@ public:
bool TestInsts(std::vector<const Instruction *> &Prefix);
};
-}
+} // namespace
bool ReduceCrashingInstructions::TestInsts(
std::vector<const Instruction *> &Insts) {
@@ -896,7 +904,7 @@ public:
bool TestNamedMDs(std::vector<std::string> &NamedMDs);
};
-}
+} // namespace
bool ReduceCrashingNamedMD::TestNamedMDs(std::vector<std::string> &NamedMDs) {
@@ -959,7 +967,7 @@ public:
bool TestNamedMDOps(std::vector<const MDNode *> &NamedMDOps);
};
-}
+} // namespace
bool ReduceCrashingNamedMDOps::TestNamedMDOps(
std::vector<const MDNode *> &NamedMDOps) {
@@ -1018,7 +1026,7 @@ static Error ReduceGlobalInitializers(BugDriver &BD, BugTester TestFn) {
for (GlobalVariable &GV : M->globals()) {
if (GV.hasInitializer()) {
- DeleteGlobalInitializer(&GV);
+ deleteGlobalInitializer(&GV);
GV.setLinkage(GlobalValue::ExternalLinkage);
GV.setComdat(nullptr);
DeletedInit = true;
@@ -1056,7 +1064,7 @@ static Error ReduceGlobalInitializers(BugDriver &BD, BugTester TestFn) {
return E;
if (GVs.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-global-variables");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-global-variables");
}
return Error::success();
}
@@ -1155,7 +1163,7 @@ static Error ReduceInsts(BugDriver &BD, BugTester TestFn) {
return E;
}
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-instructions");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-instructions");
return Error::success();
}
@@ -1186,7 +1194,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
return E;
if (Functions.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-function");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-function");
}
if (!NoAttributeRM) {
@@ -1218,7 +1226,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
}
if (OldSize < NewSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-function-attributes");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-function-attributes");
}
}
@@ -1238,7 +1246,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
if (Error E = Result.takeError())
return E;
if (Blocks.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-conditionals");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-conditionals");
}
// Attempt to delete entire basic blocks at a time to speed up
@@ -1256,7 +1264,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
if (Error E = Result.takeError())
return E;
if (Blocks.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-blocks");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-blocks");
}
if (!DisableSimplifyCFG && !BugpointIsInterrupted) {
@@ -1269,7 +1277,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
if (Error E = Result.takeError())
return E;
if (Blocks.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-simplifycfg");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-simplifycfg");
}
// Attempt to delete instructions using bisection. This should help out nasty
@@ -1319,7 +1327,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
if (Error E = Result.takeError())
return E;
}
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-named-md");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-named-md");
}
// Try to clean up the testcase by running funcresolve and globaldce...
@@ -1334,7 +1342,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
std::move(M)); // Yup, it does, keep the reduced version...
}
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-simplified");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-simplified");
return Error::success();
}
@@ -1361,7 +1369,7 @@ Error BugDriver::debugOptimizerCrash(const std::string &ID) {
<< (PassesToRun.size() == 1 ? ": " : "es: ")
<< getPassesString(PassesToRun) << '\n';
- EmitProgressBitcode(*Program, ID);
+ emitProgressBitcode(*Program, ID);
auto Res = DebugACrash(*this, TestForOptimizerCrash);
if (Res || DontReducePassList)
@@ -1376,7 +1384,7 @@ Error BugDriver::debugOptimizerCrash(const std::string &ID) {
<< (PassesToRun.size() == 1 ? ": " : "es: ")
<< getPassesString(PassesToRun) << '\n';
- EmitProgressBitcode(getProgram(), "reduced-simplified");
+ emitProgressBitcode(getProgram(), "reduced-simplified");
return Res;
}
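
The recurring change throughout CrashDebugger.cpp is a visibility cleanup: file-local cl::opt variables and helper functions move out of anonymous namespaces and become static, while anonymous namespaces are kept only around type definitions and are closed with a // namespace comment. A small illustrative sketch of the resulting convention (the flag and type are invented, not from bugpoint):

#include "llvm/Support/CommandLine.h"

// File-local options and functions use 'static' directly.
static llvm::cl::opt<bool> ExampleFlag("example-flag",
                                       llvm::cl::desc("Illustrative flag"),
                                       llvm::cl::init(false));

static bool isExampleFlagSet() { return ExampleFlag; }

// Anonymous namespaces remain, but only around type definitions, and end
// with a closing comment as in the hunks above.
namespace {
struct ExampleReducer {};
} // namespace

This matches the usual LLVM guidance of preferring static for file-local functions and keeping anonymous namespaces small.
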
diff --git a/llvm/tools/bugpoint/ExecutionDriver.cpp b/llvm/tools/bugpoint/ExecutionDriver.cpp
index 165b55f..8c6b7fb 100644
--- a/llvm/tools/bugpoint/ExecutionDriver.cpp
+++ b/llvm/tools/bugpoint/ExecutionDriver.cpp
@@ -36,15 +36,16 @@ enum OutputType {
CompileCustom,
Custom
};
+} // namespace
-cl::opt<double> AbsTolerance("abs-tolerance",
- cl::desc("Absolute error tolerated"),
- cl::init(0.0));
-cl::opt<double> RelTolerance("rel-tolerance",
- cl::desc("Relative error tolerated"),
- cl::init(0.0));
+static cl::opt<double> AbsTolerance("abs-tolerance",
+ cl::desc("Absolute error tolerated"),
+ cl::init(0.0));
+static cl::opt<double> RelTolerance("rel-tolerance",
+ cl::desc("Relative error tolerated"),
+ cl::init(0.0));
-cl::opt<OutputType> InterpreterSel(
+static cl::opt<OutputType> InterpreterSel(
cl::desc("Specify the \"test\" i.e. suspect back-end:"),
cl::values(clEnumValN(AutoPick, "auto", "Use best guess"),
clEnumValN(RunLLI, "run-int", "Execute with the interpreter"),
@@ -60,7 +61,7 @@ cl::opt<OutputType> InterpreterSel(
"the bitcode. Useful for cross-compilation.")),
cl::init(AutoPick));
-cl::opt<OutputType> SafeInterpreterSel(
+static cl::opt<OutputType> SafeInterpreterSel(
cl::desc("Specify \"safe\" i.e. known-good backend:"),
cl::values(clEnumValN(AutoPick, "safe-auto", "Use best guess"),
clEnumValN(RunLLC, "safe-run-llc", "Compile with LLC"),
@@ -69,16 +70,16 @@ cl::opt<OutputType> SafeInterpreterSel(
"the bitcode. Useful for cross-compilation.")),
cl::init(AutoPick));
-cl::opt<std::string> SafeInterpreterPath(
+static cl::opt<std::string> SafeInterpreterPath(
"safe-path", cl::desc("Specify the path to the \"safe\" backend program"),
cl::init(""));
-cl::opt<bool> AppendProgramExitCode(
+static cl::opt<bool> AppendProgramExitCode(
"append-exit-code",
cl::desc("Append the exit code to the output so it gets diff'd too"),
cl::init(false));
-cl::opt<std::string>
+static cl::opt<std::string>
InputFile("input", cl::init("/dev/null"),
cl::desc("Filename to pipe in as stdin (default: /dev/null)"));
@@ -89,20 +90,19 @@ static cl::list<std::string>
static cl::list<std::string> AdditionalLinkerArgs(
"Xlinker", cl::desc("Additional arguments to pass to the linker"));
-cl::opt<std::string> CustomCompileCommand(
+static cl::opt<std::string> CustomCompileCommand(
"compile-command", cl::init("llc"),
cl::desc("Command to compile the bitcode (use with -compile-custom) "
"(default: llc)"));
-cl::opt<std::string> CustomExecCommand(
+static cl::opt<std::string> CustomExecCommand(
"exec-command", cl::init("simulate"),
cl::desc("Command to execute the bitcode (use with -run-custom) "
"(default: simulate)"));
-}
-namespace llvm {
// Anything specified after the --args option is treated as arguments to
// the program being debugged.
+namespace llvm {
cl::list<std::string> InputArgv("args", cl::Positional,
cl::desc("<program arguments>..."),
cl::PositionalEatsArgs);
@@ -110,25 +110,22 @@ cl::list<std::string> InputArgv("args", cl::Positional,
cl::opt<std::string>
OutputPrefix("output-prefix", cl::init("bugpoint"),
cl::desc("Prefix to use for outputs (default: 'bugpoint')"));
-}
-
-namespace {
-cl::list<std::string> ToolArgv("tool-args", cl::Positional,
- cl::desc("<tool arguments>..."),
- cl::PositionalEatsArgs);
+} // namespace llvm
-cl::list<std::string> SafeToolArgv("safe-tool-args", cl::Positional,
- cl::desc("<safe-tool arguments>..."),
- cl::PositionalEatsArgs);
+static cl::list<std::string> ToolArgv("tool-args", cl::Positional,
+ cl::desc("<tool arguments>..."),
+ cl::PositionalEatsArgs);
-cl::opt<std::string> CCBinary("gcc", cl::init(""),
- cl::desc("The gcc binary to use."));
+static cl::list<std::string> SafeToolArgv("safe-tool-args", cl::Positional,
+ cl::desc("<safe-tool arguments>..."),
+ cl::PositionalEatsArgs);
-cl::list<std::string> CCToolArgv("gcc-tool-args", cl::Positional,
- cl::desc("<gcc-tool arguments>..."),
- cl::PositionalEatsArgs);
-}
+static cl::opt<std::string> CCBinary("gcc", cl::init(""),
+ cl::desc("The gcc binary to use."));
+static cl::list<std::string> CCToolArgv("gcc-tool-args", cl::Positional,
+ cl::desc("<gcc-tool arguments>..."),
+ cl::PositionalEatsArgs);
//===----------------------------------------------------------------------===//
// BugDriver method implementation
//
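
The options being made static here include the positional lists (--args, --tool-args) that swallow everything after them. A self-contained sketch, separate from bugpoint, of how a cl::Positional list with cl::PositionalEatsArgs behaves:

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"

static llvm::cl::list<std::string>
    ProgArgs("args", llvm::cl::Positional,
             llvm::cl::desc("<program arguments>..."),
             llvm::cl::PositionalEatsArgs);

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv);
  // With PositionalEatsArgs, everything after "-args" on the command line
  // lands in ProgArgs verbatim instead of being parsed as options.
  for (const std::string &A : ProgArgs)
    llvm::outs() << A << "\n";
  return 0;
}

So 'tool -args a -b c' collects "a", "-b", "c" into ProgArgs, which is what lets bugpoint forward arbitrary flags to the program under test.
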
diff --git a/llvm/tools/bugpoint/ExtractFunction.cpp b/llvm/tools/bugpoint/ExtractFunction.cpp
index dd9a82c..3206589 100644
--- a/llvm/tools/bugpoint/ExtractFunction.cpp
+++ b/llvm/tools/bugpoint/ExtractFunction.cpp
@@ -35,19 +35,19 @@ using namespace llvm;
#define DEBUG_TYPE "bugpoint"
+bool llvm::DisableSimplifyCFG = false;
namespace llvm {
-bool DisableSimplifyCFG = false;
extern cl::opt<std::string> OutputPrefix;
-} // End llvm namespace
+} // namespace llvm
-namespace {
-cl::opt<bool> NoDCE("disable-dce",
- cl::desc("Do not use the -dce pass to reduce testcases"));
-cl::opt<bool, true>
+static cl::opt<bool>
+ NoDCE("disable-dce",
+ cl::desc("Do not use the -dce pass to reduce testcases"));
+static cl::opt<bool, true>
NoSCFG("disable-simplifycfg", cl::location(DisableSimplifyCFG),
cl::desc("Do not use the -simplifycfg pass to reduce testcases"));
-Function *globalInitUsesExternalBA(GlobalVariable *GV) {
+static Function *globalInitUsesExternalBA(GlobalVariable *GV) {
if (!GV->hasInitializer())
return nullptr;
@@ -78,7 +78,6 @@ Function *globalInitUsesExternalBA(GlobalVariable *GV) {
}
return nullptr;
}
-} // end anonymous namespace
std::unique_ptr<Module>
BugDriver::deleteInstructionFromProgram(const Instruction *I,
@@ -154,7 +153,7 @@ std::unique_ptr<Module> BugDriver::extractLoop(Module *M) {
std::unique_ptr<Module> NewM = runPassesOn(M, LoopExtractPasses);
if (!NewM) {
outs() << "*** Loop extraction failed: ";
- EmitProgressBitcode(*M, "loopextraction", true);
+ emitProgressBitcode(*M, "loopextraction", true);
outs() << "*** Sorry. :( Please report a bug!\n";
return nullptr;
}
@@ -198,21 +197,16 @@ static void eliminateAliases(GlobalValue *GV) {
}
}
-//
-// DeleteGlobalInitializer - "Remove" the global variable by deleting its
-// initializer,
-// making it external.
-//
-void llvm::DeleteGlobalInitializer(GlobalVariable *GV) {
+// "Remove" the global variable by deleting its initializer, making it external.
+void llvm::deleteGlobalInitializer(GlobalVariable *GV) {
eliminateAliases(GV);
GV->setInitializer(nullptr);
GV->setComdat(nullptr);
}
-// DeleteFunctionBody - "Remove" the function by deleting all of its basic
-// blocks, making it external.
-//
-void llvm::DeleteFunctionBody(Function *F) {
+// "Remove" the function by deleting all of its basic blocks, making it
+// external.
+void llvm::deleteFunctionBody(Function *F) {
eliminateAliases(F);
// Function declarations can't have comdats.
F->setComdat(nullptr);
@@ -222,9 +216,9 @@ void llvm::DeleteFunctionBody(Function *F) {
assert(F->isDeclaration() && "This didn't make the function external!");
}
-/// GetTorInit - Given a list of entries for static ctors/dtors, return them
+/// getTorInit - Given a list of entries for static ctors/dtors, return them
/// as a constant array.
-static Constant *GetTorInit(std::vector<std::pair<Function *, int>> &TorList) {
+static Constant *getTorInit(std::vector<std::pair<Function *, int>> &TorList) {
assert(!TorList.empty() && "Don't create empty tor list!");
std::vector<Constant *> ArrayElts;
Type *Int32Ty = Type::getInt32Ty(TorList[0].first->getContext());
@@ -239,11 +233,11 @@ static Constant *GetTorInit(std::vector<std::pair<Function *, int>> &TorList) {
ArrayType::get(ArrayElts[0]->getType(), ArrayElts.size()), ArrayElts);
}
-/// SplitStaticCtorDtor - A module was recently split into two parts, M1/M2, and
+/// splitStaticCtorDtor - A module was recently split into two parts, M1/M2, and
/// M1 has all of the global variables. If M2 contains any functions that are
/// static ctors/dtors, we need to add an llvm.global_[cd]tors global to M2, and
/// prune appropriate entries out of M1s list.
-static void SplitStaticCtorDtor(const char *GlobalName, Module *M1, Module *M2,
+static void splitStaticCtorDtor(const char *GlobalName, Module *M1, Module *M2,
ValueToValueMapTy &VMap) {
GlobalVariable *GV = M1->getNamedGlobal(GlobalName);
if (!GV || GV->isDeclaration() || GV->hasLocalLinkage() || !GV->use_empty())
@@ -284,7 +278,7 @@ static void SplitStaticCtorDtor(const char *GlobalName, Module *M1, Module *M2,
GV->eraseFromParent();
if (!M1Tors.empty()) {
- Constant *M1Init = GetTorInit(M1Tors);
+ Constant *M1Init = getTorInit(M1Tors);
new GlobalVariable(*M1, M1Init->getType(), false,
GlobalValue::AppendingLinkage, M1Init, GlobalName);
}
@@ -295,14 +289,14 @@ static void SplitStaticCtorDtor(const char *GlobalName, Module *M1, Module *M2,
GV->eraseFromParent();
if (!M2Tors.empty()) {
- Constant *M2Init = GetTorInit(M2Tors);
+ Constant *M2Init = getTorInit(M2Tors);
new GlobalVariable(*M2, M2Init->getType(), false,
GlobalValue::AppendingLinkage, M2Init, GlobalName);
}
}
std::unique_ptr<Module>
-llvm::SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
+llvm::splitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
ValueToValueMapTy &VMap) {
// Make sure functions & globals are all external so that linkage
// between the two modules will work.
@@ -326,13 +320,13 @@ llvm::SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
LLVM_DEBUG(TNOF->printAsOperand(errs(), false));
LLVM_DEBUG(errs() << "\n");
TestFunctions.insert(cast<Function>(NewVMap[TNOF]));
- DeleteFunctionBody(TNOF); // Function is now external in this module!
+ deleteFunctionBody(TNOF); // Function is now external in this module!
}
// Remove the Safe functions from the Test module
for (Function &I : *New)
if (!TestFunctions.count(&I))
- DeleteFunctionBody(&I);
+ deleteFunctionBody(&I);
// Try to split the global initializers evenly
for (GlobalVariable &I : M->globals()) {
@@ -348,17 +342,17 @@ llvm::SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
<< TestFn->getName() << "'.\n";
exit(1);
}
- DeleteGlobalInitializer(&I); // Delete the initializer to make it external
+ deleteGlobalInitializer(&I); // Delete the initializer to make it external
} else {
// If we keep it in the safe module, then delete it in the test module
- DeleteGlobalInitializer(GV);
+ deleteGlobalInitializer(GV);
}
}
// Make sure that there is a global ctor/dtor array in both halves of the
// module if they both have static ctor/dtor functions.
- SplitStaticCtorDtor("llvm.global_ctors", M, New.get(), NewVMap);
- SplitStaticCtorDtor("llvm.global_dtors", M, New.get(), NewVMap);
+ splitStaticCtorDtor("llvm.global_ctors", M, New.get(), NewVMap);
+ splitStaticCtorDtor("llvm.global_dtors", M, New.get(), NewVMap);
return New;
}
@@ -375,7 +369,7 @@ BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
outs() << "*** Basic Block extraction failed!\n";
errs() << "Error creating temporary file: " << toString(Temp.takeError())
<< "\n";
- EmitProgressBitcode(*M, "basicblockextractfail", true);
+ emitProgressBitcode(*M, "basicblockextractfail", true);
return nullptr;
}
DiscardTemp Discard{*Temp};
@@ -399,7 +393,7 @@ BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
OS.flush();
if (OS.has_error()) {
errs() << "Error writing list of blocks to not extract\n";
- EmitProgressBitcode(*M, "basicblockextractfail", true);
+ emitProgressBitcode(*M, "basicblockextractfail", true);
OS.clear_error();
return nullptr;
}
@@ -413,7 +407,7 @@ BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
if (!Ret) {
outs() << "*** Basic Block extraction failed, please report a bug!\n";
- EmitProgressBitcode(*M, "basicblockextractfail", true);
+ emitProgressBitcode(*M, "basicblockextractfail", true);
}
return Ret;
}
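
splitStaticCtorDtor and getTorInit above rebuild the llvm.global_ctors / llvm.global_dtors appending arrays after a module split. As a rough sketch of what one array element looks like when built through the C++ API, here is an illustrative helper; note that real global_ctors entries today carry a third associated-data pointer field, which this sketch omits for brevity:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"

static llvm::Constant *makeCtorEntry(llvm::Function *F, int Priority) {
  llvm::LLVMContext &Ctx = F->getContext();
  // One entry pairs a priority with the constructor function itself; a
  // Function is a Constant, so it can be embedded directly.
  llvm::Constant *Fields[] = {
      llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Priority), F};
  return llvm::ConstantStruct::getAnon(Fields);
}

getTorInit then only has to wrap such entries in a ConstantArray of the matching ArrayType, as the hunk shows.
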
diff --git a/llvm/tools/bugpoint/Miscompilation.cpp b/llvm/tools/bugpoint/Miscompilation.cpp
index 4cf7de3..a7f1643 100644
--- a/llvm/tools/bugpoint/Miscompilation.cpp
+++ b/llvm/tools/bugpoint/Miscompilation.cpp
@@ -33,16 +33,16 @@ extern cl::opt<std::string> OutputPrefix;
extern cl::list<std::string> InputArgv;
} // end namespace llvm
-namespace {
-static llvm::cl::opt<bool> DisableLoopExtraction(
+static cl::opt<bool> DisableLoopExtraction(
"disable-loop-extraction",
cl::desc("Don't extract loops when searching for miscompilations"),
cl::init(false));
-static llvm::cl::opt<bool> DisableBlockExtraction(
+static cl::opt<bool> DisableBlockExtraction(
"disable-block-extraction",
cl::desc("Don't extract blocks when searching for miscompilations"),
cl::init(false));
+namespace {
class ReduceMiscompilingPasses : public ListReducer<std::string> {
BugDriver &BD;
@@ -71,7 +71,7 @@ ReduceMiscompilingPasses::doTest(std::vector<std::string> &Prefix,
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
BD.setPassesToRun(Suffix);
- BD.EmitProgressBitcode(BD.getProgram(), "pass-error", false);
+ BD.emitProgressBitcode(BD.getProgram(), "pass-error", false);
// TODO: This should propagate the error instead of exiting.
if (Error E = BD.debugOptimizerCrash())
exit(1);
@@ -113,7 +113,7 @@ ReduceMiscompilingPasses::doTest(std::vector<std::string> &Prefix,
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
BD.setPassesToRun(Prefix);
- BD.EmitProgressBitcode(BD.getProgram(), "pass-error", false);
+ BD.emitProgressBitcode(BD.getProgram(), "pass-error", false);
// TODO: This should propagate the error instead of exiting.
if (Error E = BD.debugOptimizerCrash())
exit(1);
@@ -158,7 +158,7 @@ ReduceMiscompilingPasses::doTest(std::vector<std::string> &Prefix,
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
BD.setPassesToRun(Suffix);
- BD.EmitProgressBitcode(BD.getProgram(), "pass-error", false);
+ BD.emitProgressBitcode(BD.getProgram(), "pass-error", false);
// TODO: This should propagate the error instead of exiting.
if (Error E = BD.debugOptimizerCrash())
exit(1);
@@ -253,7 +253,7 @@ ReduceMiscompilingFunctions::TestFuncs(const std::vector<Function *> &Funcs) {
<< (Funcs.size() == 1 ? "this function is" : "these functions are")
<< " run through the pass"
<< (BD.getPassesToRun().size() == 1 ? "" : "es") << ":";
- PrintFunctionList(Funcs);
+ printFunctionList(Funcs);
outs() << '\n';
// Create a clone for two reasons:
@@ -277,7 +277,7 @@ ReduceMiscompilingFunctions::TestFuncs(const std::vector<Function *> &Funcs) {
VMap.clear();
std::unique_ptr<Module> ToNotOptimize = CloneModule(BD.getProgram(), VMap);
std::unique_ptr<Module> ToOptimize =
- SplitFunctionsOutOfModule(ToNotOptimize.get(), FuncsOnClone, VMap);
+ splitFunctionsOutOfModule(ToNotOptimize.get(), FuncsOnClone, VMap);
Expected<bool> Broken =
TestFn(BD, std::move(ToOptimize), std::move(ToNotOptimize));
@@ -314,7 +314,7 @@ ExtractLoops(BugDriver &BD,
ValueToValueMapTy VMap;
std::unique_ptr<Module> ToNotOptimize = CloneModule(BD.getProgram(), VMap);
- std::unique_ptr<Module> ToOptimize = SplitFunctionsOutOfModule(
+ std::unique_ptr<Module> ToOptimize = splitFunctionsOutOfModule(
ToNotOptimize.get(), MiscompiledFunctions, VMap);
std::unique_ptr<Module> ToOptimizeLoopExtracted =
BD.extractLoop(ToOptimize.get());
@@ -517,7 +517,7 @@ ReduceMiscompiledBlocks::TestFuncs(const std::vector<BasicBlock *> &BBs) {
std::unique_ptr<Module> ToNotOptimize = CloneModule(BD.getProgram(), VMap);
std::unique_ptr<Module> ToOptimize =
- SplitFunctionsOutOfModule(ToNotOptimize.get(), FuncsOnClone, VMap);
+ splitFunctionsOutOfModule(ToNotOptimize.get(), FuncsOnClone, VMap);
// Try the extraction. If it doesn't work, then the block extractor crashed
// or otherwise failed, in which case bugpoint can't chase down this
// possibility.
@@ -572,7 +572,7 @@ ExtractBlocks(BugDriver &BD,
ValueToValueMapTy VMap;
std::unique_ptr<Module> ProgClone = CloneModule(BD.getProgram(), VMap);
std::unique_ptr<Module> ToExtract =
- SplitFunctionsOutOfModule(ProgClone.get(), MiscompiledFunctions, VMap);
+ splitFunctionsOutOfModule(ProgClone.get(), MiscompiledFunctions, VMap);
std::unique_ptr<Module> Extracted =
BD.extractMappedBlocksFromModule(Blocks, ToExtract.get());
if (!Extracted) {
@@ -638,7 +638,7 @@ static Expected<std::vector<Function *>> DebugAMiscompilation(
outs() << "\n*** The following function"
<< (MiscompiledFunctions.size() == 1 ? " is" : "s are")
<< " being miscompiled: ";
- PrintFunctionList(MiscompiledFunctions);
+ printFunctionList(MiscompiledFunctions);
outs() << '\n';
// See if we can rip any loops out of the miscompiled functions and still
@@ -663,7 +663,7 @@ static Expected<std::vector<Function *>> DebugAMiscompilation(
outs() << "\n*** The following function"
<< (MiscompiledFunctions.size() == 1 ? " is" : "s are")
<< " being miscompiled: ";
- PrintFunctionList(MiscompiledFunctions);
+ printFunctionList(MiscompiledFunctions);
outs() << '\n';
}
}
@@ -686,7 +686,7 @@ static Expected<std::vector<Function *>> DebugAMiscompilation(
outs() << "\n*** The following function"
<< (MiscompiledFunctions.size() == 1 ? " is" : "s are")
<< " being miscompiled: ";
- PrintFunctionList(MiscompiledFunctions);
+ printFunctionList(MiscompiledFunctions);
outs() << '\n';
}
}
@@ -708,7 +708,7 @@ static Expected<bool> TestOptimizer(BugDriver &BD, std::unique_ptr<Module> Test,
if (!Optimized) {
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
- BD.EmitProgressBitcode(*Test, "pass-error", false);
+ BD.emitProgressBitcode(*Test, "pass-error", false);
BD.setNewProgram(std::move(Test));
if (Error E = BD.debugOptimizerCrash())
return std::move(E);
@@ -750,7 +750,7 @@ Error BugDriver::debugMiscompilation() {
outs() << "\n*** Found miscompiling pass"
<< (getPassesToRun().size() == 1 ? "" : "es") << ": "
<< getPassesString(getPassesToRun()) << '\n';
- EmitProgressBitcode(*Program, "passinput");
+ emitProgressBitcode(*Program, "passinput");
Expected<std::vector<Function *>> MiscompiledFunctions =
DebugAMiscompilation(*this, TestOptimizer);
@@ -762,15 +762,15 @@ Error BugDriver::debugMiscompilation() {
ValueToValueMapTy VMap;
Module *ToNotOptimize = CloneModule(getProgram(), VMap).release();
Module *ToOptimize =
- SplitFunctionsOutOfModule(ToNotOptimize, *MiscompiledFunctions, VMap)
+ splitFunctionsOutOfModule(ToNotOptimize, *MiscompiledFunctions, VMap)
.release();
outs() << " Non-optimized portion: ";
- EmitProgressBitcode(*ToNotOptimize, "tonotoptimize", true);
+ emitProgressBitcode(*ToNotOptimize, "tonotoptimize", true);
delete ToNotOptimize; // Delete hacked module.
outs() << " Portion that is input to optimizer: ";
- EmitProgressBitcode(*ToOptimize, "tooptimize");
+ emitProgressBitcode(*ToOptimize, "tooptimize");
delete ToOptimize; // Delete hacked module.
return Error::success();
@@ -1028,7 +1028,7 @@ Error BugDriver::debugCodeGenerator() {
ValueToValueMapTy VMap;
std::unique_ptr<Module> ToNotCodeGen = CloneModule(getProgram(), VMap);
std::unique_ptr<Module> ToCodeGen =
- SplitFunctionsOutOfModule(ToNotCodeGen.get(), *Funcs, VMap);
+ splitFunctionsOutOfModule(ToNotCodeGen.get(), *Funcs, VMap);
// Condition the modules
ToCodeGen =
diff --git a/llvm/tools/bugpoint/OptimizerDriver.cpp b/llvm/tools/bugpoint/OptimizerDriver.cpp
index 3daacfd..bf2e8c0 100644
--- a/llvm/tools/bugpoint/OptimizerDriver.cpp
+++ b/llvm/tools/bugpoint/OptimizerDriver.cpp
@@ -82,7 +82,7 @@ bool BugDriver::writeProgramToFile(const std::string &Filename,
/// This function is used to output the current Program to a file named
/// "bugpoint-ID.bc".
-void BugDriver::EmitProgressBitcode(const Module &M, const std::string &ID,
+void BugDriver::emitProgressBitcode(const Module &M, const std::string &ID,
bool NoFlyer) const {
// Output the input to the current pass to a bitcode file and emit a message
// telling the user how to reproduce it: opt -foo blah.bc
diff --git a/llvm/tools/bugpoint/ToolRunner.cpp b/llvm/tools/bugpoint/ToolRunner.cpp
index f2f5966a..c67695f 100644
--- a/llvm/tools/bugpoint/ToolRunner.cpp
+++ b/llvm/tools/bugpoint/ToolRunner.cpp
@@ -25,29 +25,25 @@ using namespace llvm;
#define DEBUG_TYPE "toolrunner"
-namespace llvm {
-cl::opt<bool> SaveTemps("save-temps", cl::init(false),
- cl::desc("Save temporary files"));
-}
+cl::opt<bool> llvm::SaveTemps("save-temps", cl::init(false),
+ cl::desc("Save temporary files"));
-namespace {
-cl::opt<std::string>
+static cl::opt<std::string>
RemoteClient("remote-client",
cl::desc("Remote execution client (rsh/ssh)"));
-cl::opt<std::string> RemoteHost("remote-host",
- cl::desc("Remote execution (rsh/ssh) host"));
+static cl::opt<std::string>
+ RemoteHost("remote-host", cl::desc("Remote execution (rsh/ssh) host"));
-cl::opt<std::string> RemotePort("remote-port",
- cl::desc("Remote execution (rsh/ssh) port"));
+static cl::opt<std::string>
+ RemotePort("remote-port", cl::desc("Remote execution (rsh/ssh) port"));
-cl::opt<std::string> RemoteUser("remote-user",
- cl::desc("Remote execution (rsh/ssh) user id"));
+static cl::opt<std::string>
+ RemoteUser("remote-user", cl::desc("Remote execution (rsh/ssh) user id"));
-cl::opt<std::string>
+static cl::opt<std::string>
RemoteExtra("remote-extra-options",
cl::desc("Remote execution (rsh/ssh) extra options"));
-}
/// RunProgramWithTimeout - This function provides an alternate interface
/// to sys::Program::ExecuteAndWait.
@@ -160,7 +156,7 @@ public:
const std::vector<std::string> &SharedLibs = std::vector<std::string>(),
unsigned Timeout = 0, unsigned MemoryLimit = 0) override;
};
-}
+} // namespace
Expected<int> LLI::ExecuteProgram(const std::string &Bitcode,
const std::vector<std::string> &Args,
@@ -258,7 +254,7 @@ public:
inconvertibleErrorCode());
}
};
-}
+} // namespace
Error CustomCompiler::compileProgram(const std::string &Bitcode,
unsigned Timeout, unsigned MemoryLimit) {
@@ -301,7 +297,7 @@ public:
const std::vector<std::string> &SharedLibs = std::vector<std::string>(),
unsigned Timeout = 0, unsigned MemoryLimit = 0) override;
};
-}
+} // namespace
Expected<int> CustomExecutor::ExecuteProgram(
const std::string &Bitcode, const std::vector<std::string> &Args,
@@ -541,7 +537,7 @@ public:
const std::vector<std::string> &SharedLibs = std::vector<std::string>(),
unsigned Timeout = 0, unsigned MemoryLimit = 0) override;
};
-}
+} // namespace
Expected<int> JIT::ExecuteProgram(const std::string &Bitcode,
const std::vector<std::string> &Args,
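
These ToolRunner.cpp hunks all apply one pattern: cl::opt definitions move out
of anonymous namespaces and become file-local statics, matching the LLVM coding
standards, which reserve anonymous namespaces for type declarations. A minimal
sketch with illustrative names (not from this patch):

  static cl::opt<std::string>
      ExampleFlag("example-flag", cl::desc("Illustrative option"));

  namespace {
  class ExampleRunner { /* types may stay in the anonymous namespace */ };
  } // namespace
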
diff --git a/llvm/tools/bugpoint/bugpoint.cpp b/llvm/tools/bugpoint/bugpoint.cpp
index 87581e80a..52ed135 100644
--- a/llvm/tools/bugpoint/bugpoint.cpp
+++ b/llvm/tools/bugpoint/bugpoint.cpp
@@ -90,7 +90,7 @@ public:
D.addPass(std::string(PI->getPassArgument()));
}
};
-}
+} // namespace
#define HANDLE_EXTENSION(Ext) \
llvm::PassPluginLibraryInfo get##Ext##PluginInfo();
diff --git a/llvm/unittests/ADT/BitFieldsTest.cpp b/llvm/unittests/ADT/BitFieldsTest.cpp
index 3062d5d..ae541fe 100644
--- a/llvm/unittests/ADT/BitFieldsTest.cpp
+++ b/llvm/unittests/ADT/BitFieldsTest.cpp
@@ -247,8 +247,8 @@ TEST(BitfieldsTest, ValueTooBigBounded) {
Bitfield::set<A>(Storage, 0);
Bitfield::set<A>(Storage, -1);
Bitfield::set<A>(Storage, -2);
- EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, 2), "value is too big");
- EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, -3), "value is too small");
+ EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, 2), "value is out of range");
+ EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, -3), "value is out of range");
}
#endif
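
The updated expectations reflect one unified out-of-range diagnostic where the
old messages distinguished "too big" from "too small". A minimal sketch of the
interface under test; the 2-bit signed layout mirrors field A above, and the
storage type is an assumption:

  #include "llvm/ADT/Bitfields.h"
  #include <cstdint>

  using A = llvm::Bitfield::Element<int, 0, 2>; // signed range [-2, 1]
  void demo() {
    uint8_t Storage = 0;
    llvm::Bitfield::set<A>(Storage, -2);   // in range
    // llvm::Bitfield::set<A>(Storage, 2); // asserts "value is out of range"
  }
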
diff --git a/llvm/unittests/ADT/StringSwitchTest.cpp b/llvm/unittests/ADT/StringSwitchTest.cpp
index bcb1521..0fbf371 100644
--- a/llvm/unittests/ADT/StringSwitchTest.cpp
+++ b/llvm/unittests/ADT/StringSwitchTest.cpp
@@ -153,13 +153,14 @@ TEST(StringSwitchTest, EndsWithLower) {
}
TEST(StringSwitchTest, Cases) {
- enum class OSType { Windows, Linux, Unknown };
+ enum class OSType { Windows, Linux, MacOS, Unknown };
auto Translate = [](StringRef S) {
return llvm::StringSwitch<OSType>(S)
.Cases(StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt",
OSType::Windows)
.Cases("linux", "unix", "*nix", "posix", OSType::Linux)
+ .Cases({"macos", "osx"}, OSType::MacOS)
.Default(OSType::Unknown);
};
@@ -172,21 +173,26 @@ TEST(StringSwitchTest, Cases) {
EXPECT_EQ(OSType::Linux, Translate("*nix"));
EXPECT_EQ(OSType::Linux, Translate("posix"));
+ EXPECT_EQ(OSType::MacOS, Translate("macos"));
+ EXPECT_EQ(OSType::MacOS, Translate("osx"));
+
// Note that the whole string, embedded null terminator included, is
// required for the case to match.
EXPECT_EQ(OSType::Unknown, Translate("wind"));
EXPECT_EQ(OSType::Unknown, Translate("Windows"));
+ EXPECT_EQ(OSType::Unknown, Translate("MacOS"));
EXPECT_EQ(OSType::Unknown, Translate(""));
}
TEST(StringSwitchTest, CasesLower) {
- enum class OSType { Windows, Linux, Unknown };
+ enum class OSType { Windows, Linux, MacOS, Unknown };
auto Translate = [](StringRef S) {
return llvm::StringSwitch<OSType>(S)
.CasesLower(StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt",
OSType::Windows)
.CasesLower("linux", "unix", "*nix", "posix", OSType::Linux)
+ .CasesLower({"macos", "osx"}, OSType::MacOS)
.Default(OSType::Unknown);
};
@@ -202,6 +208,9 @@ TEST(StringSwitchTest, CasesLower) {
EXPECT_EQ(OSType::Windows, Translate(llvm::StringRef("wind\0ws", 7)));
EXPECT_EQ(OSType::Linux, Translate("linux"));
+ EXPECT_EQ(OSType::MacOS, Translate("macOS"));
+ EXPECT_EQ(OSType::MacOS, Translate("OSX"));
+
EXPECT_EQ(OSType::Unknown, Translate("wind"));
EXPECT_EQ(OSType::Unknown, Translate(""));
}
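
Both tests exercise the new initializer-list overloads of Cases/CasesLower,
which accept the case strings as a braced list instead of fixed-arity
arguments. A minimal usage sketch; the enum and strings are illustrative:

  #include "llvm/ADT/StringSwitch.h"

  enum class Color { Red, Blue, Unknown };
  Color classify(llvm::StringRef Name) {
    return llvm::StringSwitch<Color>(Name)
        .Cases({"red", "crimson"}, Color::Red)
        .CasesLower({"BLUE", "navy"}, Color::Blue)
        .Default(Color::Unknown);
  }
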
diff --git a/llvm/unittests/IR/ConstantFPRangeTest.cpp b/llvm/unittests/IR/ConstantFPRangeTest.cpp
index cf9b31c..67fee96 100644
--- a/llvm/unittests/IR/ConstantFPRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantFPRangeTest.cpp
@@ -8,6 +8,7 @@
#include "llvm/IR/ConstantFPRange.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "gtest/gtest.h"
@@ -1065,4 +1066,179 @@ TEST_F(ConstantFPRangeTest, sub) {
#endif
}
+TEST_F(ConstantFPRangeTest, mul) {
+ EXPECT_EQ(Full.mul(Full), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(Full.mul(Empty), Empty);
+ EXPECT_EQ(Empty.mul(Full), Empty);
+ EXPECT_EQ(Empty.mul(Empty), Empty);
+ EXPECT_EQ(One.mul(One), ConstantFPRange(APFloat(1.0)));
+ EXPECT_EQ(Some.mul(Some),
+ ConstantFPRange::getNonNaN(APFloat(-9.0), APFloat(9.0)));
+ EXPECT_EQ(SomePos.mul(SomeNeg),
+ ConstantFPRange::getNonNaN(APFloat(-9.0), APFloat(-0.0)));
+ EXPECT_EQ(PosInf.mul(PosInf), PosInf);
+ EXPECT_EQ(NegInf.mul(NegInf), PosInf);
+ EXPECT_EQ(PosInf.mul(Finite), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(NegInf.mul(Finite), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(PosInf.mul(NegInf), NegInf);
+ EXPECT_EQ(NegInf.mul(PosInf), NegInf);
+ EXPECT_EQ(PosZero.mul(NegZero), NegZero);
+ EXPECT_EQ(PosZero.mul(Zero), Zero);
+ EXPECT_EQ(NegZero.mul(NegZero), PosZero);
+ EXPECT_EQ(NegZero.mul(Zero), Zero);
+ EXPECT_EQ(NaN.mul(NaN), QNaN);
+ EXPECT_EQ(NaN.mul(Finite), QNaN);
+
+#if defined(EXPENSIVE_CHECKS)
+ EnumerateTwoInterestingConstantFPRanges(
+ [](const ConstantFPRange &LHS, const ConstantFPRange &RHS) {
+ ConstantFPRange Res = LHS.mul(RHS);
+ ConstantFPRange Expected =
+ ConstantFPRange::getEmpty(LHS.getSemantics());
+ EnumerateValuesInConstantFPRange(
+ LHS,
+ [&](const APFloat &LHSC) {
+ EnumerateValuesInConstantFPRange(
+ RHS,
+ [&](const APFloat &RHSC) {
+ APFloat Prod = LHSC * RHSC;
+ EXPECT_TRUE(Res.contains(Prod))
+ << "Wrong result for " << LHS << " * " << RHS
+ << ". The result " << Res << " should contain " << Prod;
+ if (!Expected.contains(Prod))
+ Expected = Expected.unionWith(ConstantFPRange(Prod));
+ },
+ /*IgnoreNaNPayload=*/true);
+ },
+ /*IgnoreNaNPayload=*/true);
+ EXPECT_EQ(Res, Expected)
+ << "Suboptimal result for " << LHS << " * " << RHS << ". Expected "
+ << Expected << ", but got " << Res;
+ },
+ SparseLevel::SpecialValuesOnly);
+#endif
+}
+
+TEST_F(ConstantFPRangeTest, div) {
+ EXPECT_EQ(Full.div(Full), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(Full.div(Empty), Empty);
+ EXPECT_EQ(Empty.div(Full), Empty);
+ EXPECT_EQ(Empty.div(Empty), Empty);
+ EXPECT_EQ(One.div(One), ConstantFPRange(APFloat(1.0)));
+ EXPECT_EQ(Some.div(Some), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(SomePos.div(SomeNeg),
+ ConstantFPRange(APFloat::getInf(Sem, /*Negative=*/true),
+ APFloat::getZero(Sem, /*Negative=*/true),
+ /*MayBeQNaN=*/true, /*MayBeSNaN=*/false));
+ EXPECT_EQ(PosInf.div(PosInf), QNaN);
+ EXPECT_EQ(NegInf.div(NegInf), QNaN);
+ EXPECT_EQ(PosInf.div(Finite), NonNaN);
+ EXPECT_EQ(NegInf.div(Finite), NonNaN);
+ EXPECT_EQ(PosInf.div(NegInf), QNaN);
+ EXPECT_EQ(NegInf.div(PosInf), QNaN);
+ EXPECT_EQ(Zero.div(Zero), QNaN);
+ EXPECT_EQ(SomePos.div(PosInf), PosZero);
+ EXPECT_EQ(SomeNeg.div(PosInf), NegZero);
+ EXPECT_EQ(PosInf.div(SomePos), PosInf);
+ EXPECT_EQ(NegInf.div(SomeNeg), PosInf);
+ EXPECT_EQ(NegInf.div(Some), NonNaN);
+ EXPECT_EQ(NaN.div(NaN), QNaN);
+ EXPECT_EQ(NaN.div(Finite), QNaN);
+
+#if defined(EXPENSIVE_CHECKS)
+ EnumerateTwoInterestingConstantFPRanges(
+ [](const ConstantFPRange &LHS, const ConstantFPRange &RHS) {
+ ConstantFPRange Res = LHS.div(RHS);
+ ConstantFPRange Expected =
+ ConstantFPRange::getEmpty(LHS.getSemantics());
+ EnumerateValuesInConstantFPRange(
+ LHS,
+ [&](const APFloat &LHSC) {
+ EnumerateValuesInConstantFPRange(
+ RHS,
+ [&](const APFloat &RHSC) {
+ APFloat Val = LHSC / RHSC;
+ EXPECT_TRUE(Res.contains(Val))
+ << "Wrong result for " << LHS << " / " << RHS
+ << ". The result " << Res << " should contain " << Val;
+ if (!Expected.contains(Val))
+ Expected = Expected.unionWith(ConstantFPRange(Val));
+ },
+ /*IgnoreNaNPayload=*/true);
+ },
+ /*IgnoreNaNPayload=*/true);
+ EXPECT_EQ(Res, Expected)
+ << "Suboptimal result for " << LHS << " / " << RHS << ". Expected "
+ << Expected << ", but got " << Res;
+ },
+ SparseLevel::SpecialValuesOnly);
+#endif
+}
+
+TEST_F(ConstantFPRangeTest, flushDenormals) {
+ const fltSemantics &FP8Sem = APFloat::Float8E4M3();
+ APFloat NormalVal = APFloat::getSmallestNormalized(FP8Sem);
+ APFloat Subnormal1 = NormalVal;
+ Subnormal1.next(/*nextDown=*/true);
+ APFloat Subnormal2 = APFloat::getSmallest(FP8Sem);
+ APFloat ZeroVal = APFloat::getZero(FP8Sem);
+ APFloat EdgeValues[8] = {-NormalVal, -Subnormal1, -Subnormal2, -ZeroVal,
+ ZeroVal, Subnormal2, Subnormal1, NormalVal};
+ constexpr DenormalMode::DenormalModeKind Modes[4] = {
+ DenormalMode::IEEE, DenormalMode::PreserveSign,
+ DenormalMode::PositiveZero, DenormalMode::Dynamic};
+ for (uint32_t I = 0; I != 8; ++I) {
+ for (uint32_t J = I; J != 8; ++J) {
+ ConstantFPRange OriginCR =
+ ConstantFPRange::getNonNaN(EdgeValues[I], EdgeValues[J]);
+ for (auto Mode : Modes) {
+ StringRef ModeName = denormalModeKindName(Mode);
+ ConstantFPRange FlushedCR = OriginCR;
+ FlushedCR.flushDenormals(Mode);
+
+ ConstantFPRange Expected = ConstantFPRange::getEmpty(FP8Sem);
+ auto CheckFlushedV = [&](const APFloat &V, const APFloat &FlushedV) {
+ EXPECT_TRUE(FlushedCR.contains(FlushedV))
+ << "Wrong result for flushDenormal(" << V << ", " << ModeName
+ << "). The result " << FlushedCR << " should contain "
+ << FlushedV;
+ if (!Expected.contains(FlushedV))
+ Expected = Expected.unionWith(ConstantFPRange(FlushedV));
+ };
+ EnumerateValuesInConstantFPRange(
+ OriginCR,
+ [&](const APFloat &V) {
+ if (V.isDenormal()) {
+ switch (Mode) {
+ case DenormalMode::IEEE:
+ break;
+ case DenormalMode::PreserveSign:
+ CheckFlushedV(V, APFloat::getZero(FP8Sem, V.isNegative()));
+ break;
+ case DenormalMode::PositiveZero:
+ CheckFlushedV(V, APFloat::getZero(FP8Sem));
+ break;
+ case DenormalMode::Dynamic:
+ // PreserveSign
+ CheckFlushedV(V, APFloat::getZero(FP8Sem, V.isNegative()));
+ // PositiveZero
+ CheckFlushedV(V, APFloat::getZero(FP8Sem));
+ break;
+ default:
+ llvm_unreachable("unknown denormal mode");
+ }
+ }
+ // Flushing to zero is not mandated; the unflushed value must also be contained.
+ CheckFlushedV(V, V);
+ },
+ /*IgnoreNaNPayload=*/true);
+ EXPECT_EQ(FlushedCR, Expected)
+ << "Suboptimal result for flushDenormal(" << OriginCR << ", "
+ << ModeName << "). Expected " << Expected << ", but got "
+ << FlushedCR;
+ }
+ }
+ }
+}
+
} // anonymous namespace
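
The new tests pin down conservative interval arithmetic: the mul/div result
must contain every pairwise product or quotient, and under EXPENSIVE_CHECKS the
enumeration also asserts the range is the tightest one. A sketch using the
value implied for the fixture's Some range (reading [-3.0, 3.0] off
Some.mul(Some) above is an inference):

  ConstantFPRange Some =
      ConstantFPRange::getNonNaN(APFloat(-3.0), APFloat(3.0));
  ConstantFPRange Prod = Some.mul(Some);
  // Prod is the non-NaN range [-9.0, 9.0]: every product of two values
  // drawn from [-3.0, 3.0] lies there, and no smaller range suffices.
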
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index fe9e7e8..f4693bf 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -606,12 +606,14 @@ TEST(InstructionTest, ConstrainedTrans) {
TEST(InstructionsTest, isEliminableCastPair) {
LLVMContext C;
- DataLayout DL1("p1:32:32");
+ DataLayout DL1("p1:32:32-p2:64:64:64:32");
Type *Int16Ty = Type::getInt16Ty(C);
+ Type *Int32Ty = Type::getInt32Ty(C);
Type *Int64Ty = Type::getInt64Ty(C);
Type *PtrTy64 = PointerType::get(C, 0);
Type *PtrTy32 = PointerType::get(C, 1);
+ Type *PtrTy64_32 = PointerType::get(C, 2);
// Source and destination pointers have same size -> bitcast.
EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::PtrToInt,
@@ -637,6 +639,42 @@ TEST(InstructionsTest, isEliminableCastPair) {
Int64Ty, &DL1),
0U);
+ // Destination larger than source. Pointer type same as destination.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt, Int16Ty, PtrTy64,
+ Int64Ty, &DL1),
+ CastInst::ZExt);
+
+ // Destination larger than source. Pointer type different from destination.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt, Int16Ty, PtrTy32,
+ Int64Ty, &DL1),
+ CastInst::ZExt);
+
+ // Destination smaller than source. Pointer type same as source.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt, Int64Ty, PtrTy64,
+ Int16Ty, &DL1),
+ CastInst::Trunc);
+
+ // Destination smaller than source. Pointer type different from source.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt, Int64Ty, PtrTy32,
+ Int16Ty, &DL1),
+ CastInst::Trunc);
+
+ // ptrtoaddr with address size != pointer size. Truncating case.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToAddr, Int64Ty,
+ PtrTy64_32, Int32Ty, &DL1),
+ CastInst::Trunc);
+
+ // ptrtoaddr with address size != pointer size. Non-truncating case.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToAddr, Int32Ty,
+ PtrTy64_32, Int32Ty, &DL1),
+ CastInst::BitCast);
+
// Test that we don't eliminate bitcasts between different address spaces,
// or if we don't have available pointer size information.
DataLayout DL2("e-p:32:32:32-p1:16:16:16-p2:64:64:64-i1:8:8-i8:8:8-i16:16:16"
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index 25efa00..21f10eb 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -44,6 +44,7 @@ add_llvm_unittest(SupportTests
ExtensibleRTTITest.cpp
FileCollectorTest.cpp
FileOutputBufferTest.cpp
+ Format.cpp
FormatVariadicTest.cpp
FSUniqueIDTest.cpp
GenericDomTreeTest.cpp
diff --git a/llvm/unittests/Support/Format.cpp b/llvm/unittests/Support/Format.cpp
new file mode 100644
index 0000000..c4e421f
--- /dev/null
+++ b/llvm/unittests/Support/Format.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Format.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+template <typename FormatTy>
+std::string printToString(unsigned MaxN, FormatTy &&Fmt) {
+ std::vector<char> Dst(MaxN + 2);
+ int N = Fmt.snprint(Dst.data(), Dst.size());
+ Dst.back() = 0;
+ return N < 0 ? "" : Dst.data();
+}
+
+template <typename Expected, typename Arg>
+constexpr bool checkDecayTypeEq(const Arg &arg) {
+ return std::is_same_v<detail::decay_if_c_char_array_t<Arg>, Expected>;
+}
+
+TEST(Format, DecayIfCCharArray) {
+ char Array[] = "Array";
+ const char ConstArray[] = "ConstArray";
+ char PtrBuf[] = "Ptr";
+ char *Ptr = PtrBuf;
+ const char *PtrToConst = "PtrToConst";
+
+ EXPECT_EQ(" Literal", printToString(20, format("%15s", "Literal")));
+ EXPECT_EQ(" Array", printToString(20, format("%15s", Array)));
+ EXPECT_EQ(" ConstArray", printToString(20, format("%15s", ConstArray)));
+ EXPECT_EQ(" Ptr", printToString(20, format("%15s", Ptr)));
+ EXPECT_EQ(" PtrToConst", printToString(20, format("%15s", PtrToConst)));
+
+ EXPECT_TRUE(checkDecayTypeEq<const char *>("Literal"));
+ EXPECT_TRUE(checkDecayTypeEq<const char *>(Array));
+ EXPECT_TRUE(checkDecayTypeEq<const char *>(ConstArray));
+ EXPECT_TRUE(checkDecayTypeEq<char *>(Ptr));
+ EXPECT_TRUE(checkDecayTypeEq<const char *>(PtrToConst));
+ EXPECT_TRUE(checkDecayTypeEq<char>(PtrToConst[0]));
+ EXPECT_TRUE(
+ checkDecayTypeEq<const char *>(static_cast<const char *>("Literal")));
+
+ wchar_t WCharArray[] = L"WCharArray";
+ EXPECT_TRUE(checkDecayTypeEq<wchar_t[11]>(WCharArray));
+ EXPECT_TRUE(checkDecayTypeEq<wchar_t>(WCharArray[0]));
+}
+
+} // namespace
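
The DecayIfCCharArray test pins down detail::decay_if_c_char_array_t: C char
arrays (unlike wchar_t arrays) decay to character pointers before the argument
is stored in the format object, so the %s conversion receives a pointer. A
minimal usage sketch; the rationale is inferred from the trait's name and the
expectations above:

  #include "llvm/Support/Format.h"
  #include "llvm/Support/raw_ostream.h"

  void greet() {
    char Name[] = "world"; // char[6], not a pointer
    llvm::outs() << llvm::format("hello %s\n", Name); // stored as const char *
  }
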
diff --git a/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp b/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
index c74d157..5ac4c53 100644
--- a/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
@@ -177,6 +177,57 @@ TEST_F(AArch64SelectionDAGTest, ComputeNumSignBits_VASHR) {
EXPECT_EQ(DAG->ComputeNumSignBits(Fr2), 5u);
}
+TEST_F(AArch64SelectionDAGTest, ComputeNumSignBits_SUB) {
+ SDLoc Loc;
+ auto IntVT = EVT::getIntegerVT(Context, 8);
+ auto N0 = DAG->getConstant(0x00, Loc, IntVT);
+ auto N1 = DAG->getConstant(0x01, Loc, IntVT);
+ auto N5 = DAG->getConstant(0x05, Loc, IntVT);
+ auto Nsign1 = DAG->getConstant(0x55, Loc, IntVT);
+ auto UnknownOp = DAG->getRegister(0, IntVT);
+ auto Mask = DAG->getConstant(0x1e, Loc, IntVT);
+ auto Nsign3 = DAG->getNode(ISD::AND, Loc, IntVT, Mask, UnknownOp);
+ // RHS early out
+ // Nsign1 = 01010101
+ // Nsign3 = 000????0
+ auto OpRhsEo = DAG->getNode(ISD::SUB, Loc, IntVT, Nsign3, Nsign1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpRhsEo), 1u);
+
+ // Neg 0
+ // N0 = 00000000
+ auto OpNegZero = DAG->getNode(ISD::SUB, Loc, IntVT, N0, N0);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNegZero), 8u);
+
+ // Neg 1
+ // N0 = 00000000
+ // N1 = 00000001
+ auto OpNegOne = DAG->getNode(ISD::SUB, Loc, IntVT, N0, N1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNegOne), 8u);
+
+ // Neg 5
+ // N0 = 00000000
+ // N5 = 00000101
+ auto OpNegFive = DAG->getNode(ISD::SUB, Loc, IntVT, N0, N5);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNegFive), 5u);
+
+ // Non-negative
+ // N0 = 00000000
+ // Nsign3 = 000????0
+ auto OpNonNeg = DAG->getNode(ISD::SUB, Loc, IntVT, N0, Nsign3);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNonNeg), 3u);
+
+ // LHS early out
+ // Nsign1 = 01010101
+ // Nsign3 = 000????0
+ auto OpLhsEo = DAG->getNode(ISD::SUB, Loc, IntVT, Nsign1, Nsign3);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpLhsEo), 1u);
+
+ // Nsign3 = 000????0
+ // N5 = 00000101
+ auto Op = DAG->getNode(ISD::SUB, Loc, IntVT, Nsign3, N5);
+ EXPECT_EQ(DAG->ComputeNumSignBits(Op), 2u);
+}
+
TEST_F(AArch64SelectionDAGTest, SimplifyDemandedVectorElts_EXTRACT_SUBVECTOR) {
TargetLowering TL(*TM);
diff --git a/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp b/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
index 841f44c..716f5f2 100644
--- a/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
+++ b/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
@@ -308,3 +308,223 @@ TEST(SSAUpdaterBulk, TwoBBLoop) {
EXPECT_EQ(Phi->getIncomingValueForBlock(Entry), ConstantInt::get(I32Ty, 0));
EXPECT_EQ(Phi->getIncomingValueForBlock(Loop), I);
}
+
+TEST(SSAUpdaterBulk, SimplifyPHIs) {
+ const char *IR = R"(
+ define void @main(i32 %val, i1 %cond) {
+ entry:
+ br i1 %cond, label %left, label %right
+ left:
+ %add = add i32 %val, 1
+ br label %exit
+ right:
+ %sub = sub i32 %val, 1
+ br label %exit
+ exit:
+ %phi = phi i32 [ %sub, %right ], [ %add, %left ]
+ %cmp = icmp slt i32 0, 42
+ ret void
+ }
+ )";
+
+ llvm::LLVMContext Context;
+ llvm::SMDiagnostic Err;
+ std::unique_ptr<llvm::Module> M = llvm::parseAssemblyString(IR, Err, Context);
+ ASSERT_NE(M, nullptr) << "Failed to parse IR: " << Err.getMessage();
+
+ Function *F = M->getFunction("main");
+ auto *Entry = &F->getEntryBlock();
+ auto *Left = Entry->getTerminator()->getSuccessor(0);
+ auto *Right = Entry->getTerminator()->getSuccessor(1);
+ auto *Exit = Left->getSingleSuccessor();
+ auto *Val = &*F->arg_begin();
+ auto *Phi = &Exit->front();
+ auto *Cmp = &*std::next(Exit->begin());
+ auto *Add = &Left->front();
+ auto *Sub = &Right->front();
+
+ SSAUpdaterBulk Updater;
+ Type *I32Ty = Type::getInt32Ty(Context);
+
+ // Use %val directly instead of creating a phi.
+ unsigned ValVar = Updater.AddVariable("Val", I32Ty);
+ Updater.AddAvailableValue(ValVar, Left, Val);
+ Updater.AddAvailableValue(ValVar, Right, Val);
+ Updater.AddUse(ValVar, &Cmp->getOperandUse(0));
+
+ // Use existing %phi for %add and %sub values.
+ unsigned AddSubVar = Updater.AddVariable("AddSub", I32Ty);
+ Updater.AddAvailableValue(AddSubVar, Left, Add);
+ Updater.AddAvailableValue(AddSubVar, Right, Sub);
+ Updater.AddUse(AddSubVar, &Cmp->getOperandUse(1));
+
+ auto ExitSizeBefore = Exit->size();
+ DominatorTree DT(*F);
+ Updater.RewriteAndOptimizeAllUses(DT);
+
+ // Output for Exit->dump():
+ // exit: ; preds = %right, %left
+ // %phi = phi i32 [ %sub, %right ], [ %add, %left ]
+ // %cmp = icmp slt i32 %val, %phi
+ // ret void
+
+ ASSERT_EQ(Exit->size(), ExitSizeBefore);
+ ASSERT_EQ(&Exit->front(), Phi);
+ EXPECT_EQ(Val, Cmp->getOperand(0));
+ EXPECT_EQ(Phi, Cmp->getOperand(1));
+}
+
+bool EliminateNewDuplicatePHINodes(BasicBlock *BB,
+ BasicBlock::phi_iterator FirstExistingPN);
+
+// Helper to parse the IR and run EliminateNewDuplicatePHINodes on the block
+// named "testbb".
+static void RunEliminateNewDuplicatePHINode(
+ const char *AsmText,
+ std::function<void(BasicBlock &,
+ bool(BasicBlock *BB, BasicBlock::phi_iterator))>
+ Check) {
+ LLVMContext C;
+
+ SMDiagnostic Err;
+ std::unique_ptr<Module> M = parseAssemblyString(AsmText, Err, C);
+ if (!M) {
+ Err.print("UtilsTests", errs());
+ return;
+ }
+
+ Function *F = M->getFunction("main");
+ auto BBIt = std::find_if(F->begin(), F->end(), [](const BasicBlock &Block) {
+ return Block.getName() == "testbb";
+ });
+ ASSERT_NE(BBIt, F->end());
+ Check(*BBIt, EliminateNewDuplicatePHINodes);
+}
+
+static BasicBlock::phi_iterator getPhiIt(BasicBlock &BB, unsigned Idx) {
+ return std::next(BB.phis().begin(), Idx);
+}
+
+static PHINode *getPhi(BasicBlock &BB, unsigned Idx) {
+ return &*getPhiIt(BB, Idx);
+}
+
+static int getNumPHIs(BasicBlock &BB) {
+ return std::distance(BB.phis().begin(), BB.phis().end());
+}
+
+TEST(SSAUpdaterBulk, EliminateNewDuplicatePHINodes_OrderExisting) {
+ RunEliminateNewDuplicatePHINode(R"(
+ define void @main() {
+ entry:
+ br label %testbb
+ testbb:
+ %np0 = phi i32 [ 1, %entry ]
+ %np1 = phi i32 [ 1, %entry ]
+ %ep0 = phi i32 [ 1, %entry ]
+ %ep1 = phi i32 [ 1, %entry ]
+ %u = add i32 %np0, %np1
+ ret void
+ }
+ )", [](BasicBlock &BB, auto *ENDPN) {
+ AssertingVH<PHINode> EP0 = getPhi(BB, 2);
+ AssertingVH<PHINode> EP1 = getPhi(BB, 3);
+ EXPECT_TRUE(ENDPN(&BB, getPhiIt(BB, 2)));
+ // Expected:
+ // %ep0 = phi i32 [ 1, %entry ]
+ // %ep1 = phi i32 [ 1, %entry ]
+ // %u = add i32 %ep0, %ep0
+ EXPECT_EQ(getNumPHIs(BB), 2);
+ Instruction &Add = *BB.getFirstNonPHIIt();
+ EXPECT_EQ(Add.getOperand(0), EP0);
+ EXPECT_EQ(Add.getOperand(1), EP0);
+ (void)EP1; // Avoid "unused" warning.
+ });
+}
+
+TEST(SSAUpdaterBulk, EliminateNewDuplicatePHINodes_OrderNew) {
+ RunEliminateNewDuplicatePHINode(R"(
+ define void @main() {
+ entry:
+ br label %testbb
+ testbb:
+ %np0 = phi i32 [ 1, %entry ]
+ %np1 = phi i32 [ 1, %entry ]
+ %ep0 = phi i32 [ 2, %entry ]
+ %ep1 = phi i32 [ 2, %entry ]
+ %u = add i32 %np0, %np1
+ ret void
+ }
+ )", [](BasicBlock &BB, auto *ENDPN) {
+ AssertingVH<PHINode> NP0 = getPhi(BB, 0);
+ AssertingVH<PHINode> EP0 = getPhi(BB, 2);
+ AssertingVH<PHINode> EP1 = getPhi(BB, 3);
+ EXPECT_TRUE(ENDPN(&BB, getPhiIt(BB, 2)));
+ // Expected:
+ // %np0 = phi i32 [ 1, %entry ]
+ // %ep0 = phi i32 [ 2, %entry ]
+ // %ep1 = phi i32 [ 2, %entry ]
+ // %u = add i32 %np0, %np0
+ EXPECT_EQ(getNumPHIs(BB), 3);
+ Instruction &Add = *BB.getFirstNonPHIIt();
+ EXPECT_EQ(Add.getOperand(0), NP0);
+ EXPECT_EQ(Add.getOperand(1), NP0);
+ (void)EP0;
+ (void)EP1; // Avoid "unused" warning.
+ });
+}
+
+TEST(SSAUpdaterBulk, EliminateNewDuplicatePHINodes_NewRefExisting) {
+ RunEliminateNewDuplicatePHINode(R"(
+ define void @main() {
+ entry:
+ br label %testbb
+ testbb:
+ %np0 = phi i32 [ 1, %entry ], [ %ep0, %testbb ]
+ %np1 = phi i32 [ 1, %entry ], [ %ep1, %testbb ]
+ %ep0 = phi i32 [ 1, %entry ], [ %ep0, %testbb ]
+ %ep1 = phi i32 [ 1, %entry ], [ %ep1, %testbb ]
+ %u = add i32 %np0, %np1
+ br label %testbb
+ }
+ )", [](BasicBlock &BB, auto *ENDPN) {
+ AssertingVH<PHINode> EP0 = getPhi(BB, 2);
+ AssertingVH<PHINode> EP1 = getPhi(BB, 3);
+ EXPECT_TRUE(ENDPN(&BB, getPhiIt(BB, 2)));
+ // Expected:
+ // %ep0 = phi i32 [ 1, %entry ], [ %ep0, %testbb ]
+ // %ep1 = phi i32 [ 1, %entry ], [ %ep1, %testbb ]
+ // %u = add i32 %ep0, %ep1
+ EXPECT_EQ(getNumPHIs(BB), 2);
+ Instruction &Add = *BB.getFirstNonPHIIt();
+ EXPECT_EQ(Add.getOperand(0), EP0);
+ EXPECT_EQ(Add.getOperand(1), EP1);
+ });
+}
+
+TEST(SSAUpdaterBulk, EliminateNewDuplicatePHINodes_ExistingRefNew) {
+ RunEliminateNewDuplicatePHINode(R"(
+ define void @main() {
+ entry:
+ br label %testbb
+ testbb:
+ %np0 = phi i32 [ 1, %entry ], [ %np0, %testbb ]
+ %np1 = phi i32 [ 1, %entry ], [ %np1, %testbb ]
+ %ep0 = phi i32 [ 1, %entry ], [ %np0, %testbb ]
+ %ep1 = phi i32 [ 1, %entry ], [ %np1, %testbb ]
+ %u = add i32 %np0, %np1
+ br label %testbb
+ }
+ )", [](BasicBlock &BB, auto *ENDPN) {
+ AssertingVH<PHINode> EP0 = getPhi(BB, 2);
+ AssertingVH<PHINode> EP1 = getPhi(BB, 3);
+ EXPECT_TRUE(ENDPN(&BB, getPhiIt(BB, 2)));
+ // Expected:
+ // %ep0 = phi i32 [ 1, %entry ], [ %ep0, %testbb ]
+ // %ep1 = phi i32 [ 1, %entry ], [ %ep1, %testbb ]
+ // %u = add i32 %ep0, %ep1
+ EXPECT_EQ(getNumPHIs(BB), 2);
+ Instruction &Add = *BB.getFirstNonPHIIt();
+ EXPECT_EQ(Add.getOperand(0), EP0);
+ EXPECT_EQ(Add.getOperand(1), EP1);
+ });
+}
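
The SimplifyPHIs test above walks the full SSAUpdaterBulk flow; condensed, the
same call sequence is (names here are illustrative):

  SSAUpdaterBulk Updater;
  unsigned Var = Updater.AddVariable("x", Int32Ty);
  Updater.AddAvailableValue(Var, LeftBB, LeftVal);
  Updater.AddAvailableValue(Var, RightBB, RightVal);
  Updater.AddUse(Var, &UseSite);         // a Use reached by both values
  DominatorTree DT(F);
  Updater.RewriteAndOptimizeAllUses(DT); // rewrites, then folds trivial PHIs
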
diff --git a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
index 55b68f5..2a0f500 100644
--- a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
@@ -45,8 +45,7 @@ TEST_F(VPDominatorTreeTest, DominanceNoRegionsTest) {
VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
EXPECT_TRUE(VPDT.dominates(VPBB1, VPBB4));
EXPECT_FALSE(VPDT.dominates(VPBB4, VPBB1));
@@ -118,8 +117,7 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
VPBlockUtils::connectBlocks(R1, R2);
VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader());
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
checkDomChildren(VPDT, R1, {R1BB1});
checkDomChildren(VPDT, R1BB1, {R1BB2, R1BB4, R1BB3});
@@ -197,8 +195,7 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
VPBlockUtils::connectBlocks(R1, VPBB2);
VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
checkDomChildren(VPDT, VPBB1, {R1});
checkDomChildren(VPDT, R1, {R1BB1});
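
Each hunk swaps default construction followed by recalculate(Plan) for the
converting constructor, building the dominator tree in one step:

  VPDominatorTree VPDT(Plan); // was: VPDominatorTree VPDT; VPDT.recalculate(Plan);
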
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index bdcb8a3..343c2bb71 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -1129,6 +1129,7 @@ Transforms/LowerIFunc/ifunc-alias.ll
Transforms/LowerIFunc/ifunc-nonsense-resolvers.ll
Transforms/LowerIFunc/ifunc-program-addrspace.ll
Transforms/LowerIFunc/lower-ifunc.ll
+Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
Transforms/LowerMatrixIntrinsics/multiply-fused.ll
Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
@@ -1311,82 +1312,6 @@ Transforms/SimpleLoopUnswitch/pr60736.ll
Transforms/SimpleLoopUnswitch/trivial-unswitch-freeze-individual-conditions.ll
Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
Transforms/SimpleLoopUnswitch/trivial-unswitch-logical-and-or.ll
-Transforms/SLPVectorizer/AArch64/gather-root.ll
-Transforms/SLPVectorizer/AArch64/horizontal.ll
-Transforms/SLPVectorizer/AArch64/loadi8.ll
-Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
-Transforms/SLPVectorizer/AArch64/uselistorder.ll
-Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
-Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
-Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
-Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
-Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll
-Transforms/SLPVectorizer/call-arg-reduced-by-minbitwidth.ll
-Transforms/SLPVectorizer/const-bool-logical-or-reduction.ll
-Transforms/SLPVectorizer/extracts-with-undefs.ll
-Transforms/SLPVectorizer/freeze-signedness-missed.ll
-Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
-Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll
-Transforms/SLPVectorizer/insert-element-build-vector-const.ll
-Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
-Transforms/SLPVectorizer/insert-element-build-vector.ll
-Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll
-Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
-Transforms/SLPVectorizer/minbitwidth-user-not-min.ll
-Transforms/SLPVectorizer/partial-register-extract.ll
-Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll
-Transforms/SLPVectorizer/reorder-node.ll
-Transforms/SLPVectorizer/reused-buildvector-matching-vectorized-node.ll
-Transforms/SLPVectorizer/revec.ll
-Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll
-Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
-Transforms/SLPVectorizer/RISCV/reordered-interleaved-loads.ll
-Transforms/SLPVectorizer/RISCV/revec.ll
-Transforms/SLPVectorizer/RISCV/select-profitability.ll
-Transforms/SLPVectorizer/RISCV/shuffled-gather-casted.ll
-Transforms/SLPVectorizer/RISCV/unsigned-node-trunc-with-signed-users.ll
-Transforms/SLPVectorizer/slp-deleted-inst.ll
-Transforms/SLPVectorizer/SystemZ/cmp-ptr-minmax.ll
-Transforms/SLPVectorizer/SystemZ/ext-not-resized-op-resized.ll
-Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll
-Transforms/SLPVectorizer/X86/bool-mask.ll
-Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
-Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll
-Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll
-Transforms/SLPVectorizer/X86/cmp_sel.ll
-Transforms/SLPVectorizer/X86/crash_7zip.ll
-Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
-Transforms/SLPVectorizer/X86/crash_cmpop.ll
-Transforms/SLPVectorizer/X86/debug-counter.ll
-Transforms/SLPVectorizer/X86/debug-info-salvage.ll
-Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
-Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
-Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll
-Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
-Transforms/SLPVectorizer/X86/horizontal-minmax.ll
-Transforms/SLPVectorizer/X86/insert-after-bundle.ll
-Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
-Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
-Transforms/SLPVectorizer/X86/minbw-user-non-sizable.ll
-Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
-Transforms/SLPVectorizer/X86/ordering-bug.ll
-Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll
-Transforms/SLPVectorizer/X86/phi-node-reshuffled-part.ll
-Transforms/SLPVectorizer/X86/pr46983.ll
-Transforms/SLPVectorizer/X86/pr49933.ll
-Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
-Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll
-Transforms/SLPVectorizer/X86/reduction-logical.ll
-Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll
-Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
-Transforms/SLPVectorizer/X86/select-reduction-op.ll
-Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
-Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
-Transforms/SLPVectorizer/X86/undef_vect.ll
-Transforms/SLPVectorizer/X86/used-reduced-op.ll
-Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
-Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
-Transforms/SLPVectorizer/X86/whole-registers-compare.ll
Transforms/SROA/addrspacecast.ll
Transforms/SROA/phi-and-select.ll
Transforms/SROA/phi-gep.ll