122 files changed, 3646 insertions, 851 deletions
diff --git a/llvm/include/llvm/ADT/Bitfields.h b/llvm/include/llvm/ADT/Bitfields.h
index 4064d71..1af2761 100644
--- a/llvm/include/llvm/ADT/Bitfields.h
+++ b/llvm/include/llvm/ADT/Bitfields.h
@@ -86,89 +86,43 @@
 #include <limits>  // numeric_limits
 #include <type_traits>
 
+#include "llvm/Support/MathExtras.h"
+
 namespace llvm {
 
 namespace bitfields_details {
 
-/// A struct defining useful bit patterns for n-bits integer types.
-template <typename T, unsigned Bits> struct BitPatterns {
-  /// Bit patterns are forged using the equivalent `Unsigned` type because of
-  /// undefined operations over signed types (e.g. Bitwise shift operators).
-  /// Moreover same size casting from unsigned to signed is well defined but not
-  /// the other way around.
-  using Unsigned = std::make_unsigned_t<T>;
-  static_assert(sizeof(Unsigned) == sizeof(T), "Types must have same size");
-
-  static constexpr unsigned TypeBits = sizeof(Unsigned) * CHAR_BIT;
-  static_assert(TypeBits >= Bits, "n-bit must fit in T");
-
-  /// e.g. with TypeBits == 8 and Bits == 6.
-  static constexpr Unsigned AllZeros = Unsigned(0);                  // 00000000
-  static constexpr Unsigned AllOnes = ~Unsigned(0);                  // 11111111
-  static constexpr Unsigned Umin = AllZeros;                         // 00000000
-  static constexpr Unsigned Umax = AllOnes >> (TypeBits - Bits);     // 00111111
-  static constexpr Unsigned SignBitMask = Unsigned(1) << (Bits - 1); // 00100000
-  static constexpr Unsigned Smax = Umax >> 1U;                       // 00011111
-  static constexpr Unsigned Smin = ~Smax;                            // 11100000
-  static constexpr Unsigned SignExtend = Unsigned(Smin << 1U);       // 11000000
-};
-
-/// `Compressor` is used to manipulate the bits of a (possibly signed) integer
-/// type so it can be packed and unpacked into a `bits` sized integer,
-/// `Compressor` is specialized on signed-ness so no runtime cost is incurred.
-/// The `pack` method also checks that the passed in `UserValue` is valid.
-template <typename T, unsigned Bits, bool = std::is_unsigned<T>::value>
-struct Compressor {
-  static_assert(std::is_unsigned<T>::value, "T must be unsigned");
-  using BP = BitPatterns<T, Bits>;
-
-  static T pack(T UserValue, T UserMaxValue) {
-    assert(UserValue <= UserMaxValue && "value is too big");
-    assert(UserValue <= BP::Umax && "value is too big");
-    return UserValue;
-  }
-
-  static T unpack(T StorageValue) { return StorageValue; }
-};
-
-template <typename T, unsigned Bits> struct Compressor<T, Bits, false> {
-  static_assert(std::is_signed<T>::value, "T must be signed");
-  using BP = BitPatterns<T, Bits>;
-
-  static T pack(T UserValue, T UserMaxValue) {
-    assert(UserValue <= UserMaxValue && "value is too big");
-    assert(UserValue <= T(BP::Smax) && "value is too big");
-    assert(UserValue >= T(BP::Smin) && "value is too small");
-    if (UserValue < 0)
-      UserValue &= ~BP::SignExtend;
-    return UserValue;
-  }
-
-  static T unpack(T StorageValue) {
-    if (StorageValue >= T(BP::SignBitMask))
-      StorageValue |= BP::SignExtend;
-    return StorageValue;
-  }
-};
-
 /// Impl is where Bifield description and Storage are put together to interact
 /// with values.
 template <typename Bitfield, typename StorageType> struct Impl {
   static_assert(std::is_unsigned<StorageType>::value,
                 "Storage must be unsigned");
   using IntegerType = typename Bitfield::IntegerType;
-  using C = Compressor<IntegerType, Bitfield::Bits>;
-  using BP = BitPatterns<StorageType, Bitfield::Bits>;
 
   static constexpr size_t StorageBits = sizeof(StorageType) * CHAR_BIT;
   static_assert(Bitfield::FirstBit <= StorageBits, "Data must fit in mask");
   static_assert(Bitfield::LastBit <= StorageBits, "Data must fit in mask");
-  static constexpr StorageType Mask = BP::Umax << Bitfield::Shift;
+  static constexpr StorageType LowMask =
+      maskTrailingOnes<StorageType>(Bitfield::Bits);
+  static constexpr StorageType Mask = LowMask << Bitfield::Shift;
+
+  /// Validates that `UserValue` fits within the bitfield's range.
+  static void checkValue(IntegerType UserValue, IntegerType UserMaxValue) {
+    assert(UserValue <= UserMaxValue && "value is too big");
+    if constexpr (std::is_unsigned_v<IntegerType>) {
+      assert(isUInt<Bitfield::Bits>(UserValue) && "value is too big");
+    } else {
+      static_assert(std::is_signed_v<IntegerType>,
+                    "IntegerType must be signed");
+      assert(isInt<Bitfield::Bits>(UserValue) && "value is out of range");
+    }
+  }
 
   /// Checks `UserValue` is within bounds and packs it between `FirstBit` and
   /// `LastBit` of `Packed` leaving the rest unchanged.
   static void update(StorageType &Packed, IntegerType UserValue) {
-    const StorageType StorageValue = C::pack(UserValue, Bitfield::UserMaxValue);
+    checkValue(UserValue, Bitfield::UserMaxValue);
+    const StorageType StorageValue = UserValue & LowMask;
     Packed &= ~Mask;
     Packed |= StorageValue << Bitfield::Shift;
   }
@@ -177,7 +131,9 @@ template <typename Bitfield, typename StorageType> struct Impl {
   /// an`IntegerType`.
   static IntegerType extract(StorageType Packed) {
     const StorageType StorageValue = (Packed & Mask) >> Bitfield::Shift;
-    return C::unpack(StorageValue);
+    if constexpr (std::is_signed_v<IntegerType>)
+      return SignExtend64<Bitfield::Bits>(StorageValue);
+    return StorageValue;
   }
 
   /// Interprets bits between `FirstBit` and `LastBit` of `Packed` as
diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
index 164b46b..07a482d 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
@@ -182,6 +182,12 @@ m_scev_PtrToInt(const Op0_t &Op0) {
   return SCEVUnaryExpr_match<SCEVPtrToIntExpr, Op0_t>(Op0);
 }
 
+template <typename Op0_t>
+inline SCEVUnaryExpr_match<SCEVTruncateExpr, Op0_t>
+m_scev_Trunc(const Op0_t &Op0) {
+  return m_scev_Unary<SCEVTruncateExpr>(Op0);
+}
+
 /// Match a binary SCEV.
 template <typename SCEVTy, typename Op0_t, typename Op1_t,
           SCEV::NoWrapFlags WrapFlags = SCEV::FlagAnyWrap,
diff --git a/llvm/include/llvm/Analysis/StaticDataProfileInfo.h b/llvm/include/llvm/Analysis/StaticDataProfileInfo.h
index fa21eba..f06e7ce 100644
--- a/llvm/include/llvm/Analysis/StaticDataProfileInfo.h
+++ b/llvm/include/llvm/Analysis/StaticDataProfileInfo.h
@@ -10,6 +10,24 @@
 
 namespace llvm {
 
+namespace memprof {
+// Represents the eligibility status of a global variable for section prefix
+// annotation. Other than AnnotationOk, each enum value indicates a specific
+// reason for ineligibility.
+enum class AnnotationKind : uint8_t {
+  AnnotationOK,
+  DeclForLinker,
+  ExplicitSection,
+  ReservedName,
+};
+/// Returns the annotation kind of the global variable \p GV.
+AnnotationKind getAnnotationKind(const GlobalVariable &GV);
+
+/// Returns true if the annotation kind of the global variable \p GV is
+/// AnnotationOK.
+bool IsAnnotationOK(const GlobalVariable &GV);
+} // namespace memprof
+
 /// A class that holds the constants that represent static data and their
 /// profile information and provides methods to operate on them.
 class StaticDataProfileInfo {
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index c76c83d..ff3dd0d 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -514,6 +514,12 @@ enum NodeType {
   /// separately rounded operations.
   FMAD,
 
+  /// FMULADD - Performs a * b + c, with, or without, intermediate rounding.
+  /// It is expected that this will be illegal for most targets, as it usually
+  /// makes sense to split this or use an FMA. But some targets, such as
+  /// WebAssembly, can directly support these semantics.
+  FMULADD,
+
   /// FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.  NOTE: This
   /// DAG node does not require that X and Y have the same type, just that
   /// they are both floating point.  X and the result must have the same type.
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 1694a33..46b3d53 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -472,7 +472,7 @@ __OMP_RTL(__kmpc_target_init, false, Int32, KernelEnvironmentPtr, KernelLaunchEn
 __OMP_RTL(__kmpc_target_deinit, false, Void,)
 __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
 __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
-          FuncPtrTy, VoidPtr, VoidPtrPtr, SizeTy)
+          FuncPtrTy, FuncPtrTy, VoidPtrPtr, SizeTy)
 __OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
 __OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
 __OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int8)
diff --git a/llvm/include/llvm/IR/ConstantFPRange.h b/llvm/include/llvm/IR/ConstantFPRange.h
index 10467cc..e772095 100644
--- a/llvm/include/llvm/IR/ConstantFPRange.h
+++ b/llvm/include/llvm/IR/ConstantFPRange.h
@@ -231,6 +231,15 @@ public:
   /// from a subtraction of a value in this range and a value in \p Other.
   LLVM_ABI ConstantFPRange sub(const ConstantFPRange &Other) const;
 
+  /// Return a new range representing the possible values resulting
+  /// from a multiplication of a value in this range and a value in \p Other.
+  LLVM_ABI ConstantFPRange mul(const ConstantFPRange &Other) const;
+
+  /// Return a new range representing the possible values resulting
+  /// from a division of a value in this range and a value in
+  /// \p Other.
+  LLVM_ABI ConstantFPRange div(const ConstantFPRange &Other) const;
+
   /// Flush denormal values to zero according to the specified mode.
   /// For dynamic mode, we return the union of all possible results.
   LLVM_ABI void flushDenormals(DenormalMode::DenormalModeKind Mode);
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 041a4ce..dacda0a 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2548,6 +2548,11 @@ public:
       std::optional<RoundingMode> Rounding = std::nullopt,
       std::optional<fp::ExceptionBehavior> Except = std::nullopt);
 
+  LLVM_ABI Value *CreateSelectWithUnknownProfile(Value *C, Value *True,
+                                                 Value *False,
+                                                 StringRef PassName,
+                                                 const Twine &Name = "");
+
   LLVM_ABI Value *CreateSelect(Value *C, Value *True, Value *False,
                                const Twine &Name = "",
                                Instruction *MDFrom = nullptr);
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 632be7a..07a858f 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -535,6 +535,7 @@ def fdiv       : SDNode<"ISD::FDIV"       , SDTFPBinOp>;
 def frem       : SDNode<"ISD::FREM"       , SDTFPBinOp>;
 def fma        : SDNode<"ISD::FMA"        , SDTFPTernaryOp, [SDNPCommutative]>;
 def fmad       : SDNode<"ISD::FMAD"       , SDTFPTernaryOp, [SDNPCommutative]>;
+def fmuladd    : SDNode<"ISD::FMULADD"    , SDTFPTernaryOp, [SDNPCommutative]>;
 def fabs       : SDNode<"ISD::FABS"       , SDTFPUnaryOp>;
 def fminnum    : SDNode<"ISD::FMINNUM"    , SDTFPBinOp,
                                   [SDNPCommutative, SDNPAssociative]>;
diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
index b1fca55..2ac58a5 100644
--- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h
+++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
@@ -161,6 +161,8 @@ inline static bool isAltFmt(unsigned VType) { return VType & 0x100; }
 
 LLVM_ABI void printVType(unsigned VType, raw_ostream &OS);
 
+LLVM_ABI void printXSfmmVType(unsigned VType, raw_ostream &OS);
+
 LLVM_ABI unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul);
 
 LLVM_ABI std::optional<VLMUL> getSameRatioLMUL(unsigned SEW, VLMUL VLMUL,
diff --git a/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h b/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h
index 558984f..eb2b34d 100644
--- a/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h
+++ b/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h
@@ -12,9 +12,7 @@
 #ifndef LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
 #define LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
 
-namespace llvm {
-
-namespace coro {
+namespace llvm::coro {
 
 // True if I is trivially rematerialzable, e.g. InsertElementInst
 LLVM_ABI bool isTriviallyMaterializable(Instruction &I);
@@ -24,8 +22,6 @@ LLVM_ABI void
 doRematerializations(Function &F, SuspendCrossingInfo &Checker,
                      std::function<bool(Instruction &)> IsMaterializable);
 
-} // namespace coro
-
-} // namespace llvm
+} // namespace llvm::coro
 
 #endif // LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
diff --git a/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h b/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h
index 6cdf83c0..356f9ca 100644
--- a/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h
+++ b/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h
@@ -13,9 +13,7 @@
 #ifndef LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H
 #define LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H
 
-namespace llvm {
-
-namespace coro {
+namespace llvm::coro {
 
 using SpillInfo = SmallMapVector<Value *, SmallVector<Instruction *, 2>, 8>;
 
@@ -38,6 +36,7 @@ void collectSpillsAndAllocasFromInsts(
     SmallVector<CoroAllocaAllocInst *, 4> &LocalAllocas, Function &F,
     const SuspendCrossingInfo &Checker, const DominatorTree &DT,
     const coro::Shape &Shape);
+
 void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
                               const SuspendCrossingInfo &Checker);
 
@@ -52,8 +51,6 @@ void sinkSpillUsesAfterCoroBegin(const DominatorTree &DT,
 BasicBlock::iterator getSpillInsertionPt(const coro::Shape &, Value *Def,
                                          const DominatorTree &DT);
 
-} // namespace coro
-
-} // namespace llvm
+} // namespace llvm::coro
 
 #endif // LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H
diff --git a/llvm/include/llvm/XRay/BlockIndexer.h b/llvm/include/llvm/XRay/BlockIndexer.h
index e9782da..155e6bd 100644
--- a/llvm/include/llvm/XRay/BlockIndexer.h
+++ b/llvm/include/llvm/XRay/BlockIndexer.h
@@ -19,8 +19,7 @@
 #include <cstdint>
 #include <vector>
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 // The BlockIndexer will gather all related records associated with a
 // process+thread and group them by 'Block'.
@@ -63,7 +62,6 @@ public:
   Error flush();
 };
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_BLOCKINDEXER_H
diff --git a/llvm/include/llvm/XRay/BlockPrinter.h b/llvm/include/llvm/XRay/BlockPrinter.h
index caf78c5..81944a5 100644
--- a/llvm/include/llvm/XRay/BlockPrinter.h
+++ b/llvm/include/llvm/XRay/BlockPrinter.h
@@ -18,8 +18,7 @@
 #include "llvm/XRay/FDRRecords.h"
 #include "llvm/XRay/RecordPrinter.h"
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 class LLVM_ABI BlockPrinter : public RecordVisitor {
   enum class State {
@@ -55,7 +54,6 @@ public:
   void reset() { CurrentState = State::Start; }
 };
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_BLOCKPRINTER_H
diff --git a/llvm/include/llvm/XRay/BlockVerifier.h b/llvm/include/llvm/XRay/BlockVerifier.h
index b88785c..5e7b25c 100644
--- a/llvm/include/llvm/XRay/BlockVerifier.h
+++ b/llvm/include/llvm/XRay/BlockVerifier.h
@@ -16,8 +16,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/XRay/FDRRecords.h"
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 class LLVM_ABI BlockVerifier : public RecordVisitor {
 public:
@@ -64,7 +63,6 @@ public:
   void reset();
 };
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_BLOCKVERIFIER_H
diff --git a/llvm/include/llvm/XRay/FDRLogBuilder.h b/llvm/include/llvm/XRay/FDRLogBuilder.h
index f07c446..5f7b815 100644
--- a/llvm/include/llvm/XRay/FDRLogBuilder.h
+++ b/llvm/include/llvm/XRay/FDRLogBuilder.h
@@ -10,8 +10,7 @@
 
 #include "llvm/XRay/FDRRecords.h"
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 /// The LogBuilder class allows for creating ad-hoc collections of records
 /// through the `add<...>(...)` function. An example use of this API is in
@@ -34,7 +33,6 @@ public:
   std::vector<std::unique_ptr<Record>> consume() { return std::move(Records); }
 };
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_FDRLOGBUILDER_H
diff --git a/llvm/include/llvm/XRay/FDRRecordConsumer.h b/llvm/include/llvm/XRay/FDRRecordConsumer.h
index 473777f..13bb711 100644
--- a/llvm/include/llvm/XRay/FDRRecordConsumer.h
+++ b/llvm/include/llvm/XRay/FDRRecordConsumer.h
@@ -15,8 +15,7 @@
 #include <memory>
 #include <vector>
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 class RecordConsumer {
 public:
@@ -48,7 +47,6 @@ public:
   Error consume(std::unique_ptr<Record> R) override;
 };
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_FDRRECORDCONSUMER_H
diff --git a/llvm/include/llvm/XRay/FDRRecordProducer.h b/llvm/include/llvm/XRay/FDRRecordProducer.h
index 083b571..b953f62 100644
--- a/llvm/include/llvm/XRay/FDRRecordProducer.h
+++ b/llvm/include/llvm/XRay/FDRRecordProducer.h
@@ -14,8 +14,7 @@
 #include "llvm/XRay/XRayRecord.h"
 #include <memory>
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 class RecordProducer {
 public:
@@ -45,7 +44,6 @@ public:
   Expected<std::unique_ptr<Record>> produce() override;
 };
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_FDRRECORDPRODUCER_H
diff --git a/llvm/include/llvm/XRay/FDRRecords.h b/llvm/include/llvm/XRay/FDRRecords.h
index 7ee8db6..91689cae 100644
--- a/llvm/include/llvm/XRay/FDRRecords.h
+++ b/llvm/include/llvm/XRay/FDRRecords.h
@@ -23,8 +23,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/XRay/XRayRecord.h"
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 class RecordVisitor;
 class RecordInitializer;
@@ -444,7 +443,6 @@ public:
   Error visit(TypedEventRecord &) override;
 };
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_FDRRECORDS_H
diff --git a/llvm/include/llvm/XRay/FDRTraceExpander.h b/llvm/include/llvm/XRay/FDRTraceExpander.h
index 197c123..ca400c9 100644
--- a/llvm/include/llvm/XRay/FDRTraceExpander.h
+++ b/llvm/include/llvm/XRay/FDRTraceExpander.h
@@ -17,8 +17,7 @@
 #include "llvm/XRay/FDRRecords.h"
 #include "llvm/XRay/XRayRecord.h"
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 class TraceExpander : public RecordVisitor {
   // Type-erased callback for handling individual XRayRecord instances.
@@ -56,7 +55,6 @@ public:
   Error flush();
 };
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_FDRTRACEEXPANDER_H
diff --git a/llvm/include/llvm/XRay/FDRTraceWriter.h b/llvm/include/llvm/XRay/FDRTraceWriter.h
index a3dc58e..957039d 100644
--- a/llvm/include/llvm/XRay/FDRTraceWriter.h
+++ b/llvm/include/llvm/XRay/FDRTraceWriter.h
@@ -18,8 +18,7 @@
 #include "llvm/XRay/FDRRecords.h"
 #include "llvm/XRay/XRayRecord.h"
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 /// The FDRTraceWriter allows us to hand-craft an XRay Flight Data Recorder
 /// (FDR) mode log file. This is used primarily for testing, generating
@@ -50,7 +49,6 @@ private:
   support::endian::Writer OS;
 };
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_FDRTRACEWRITER_H
diff --git a/llvm/include/llvm/XRay/FileHeaderReader.h b/llvm/include/llvm/XRay/FileHeaderReader.h
index ecdb975..758ca29 100644
--- a/llvm/include/llvm/XRay/FileHeaderReader.h
+++ b/llvm/include/llvm/XRay/FileHeaderReader.h
@@ -19,15 +19,13 @@
 #include "llvm/XRay/XRayRecord.h"
 #include <cstdint>
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 /// Convenience function for loading the file header given a data extractor at a
 /// specified offset.
 LLVM_ABI Expected<XRayFileHeader>
 readBinaryFormatHeader(DataExtractor &HeaderExtractor, uint64_t &OffsetPtr);
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_FILEHEADERREADER_H
diff --git a/llvm/include/llvm/XRay/Graph.h b/llvm/include/llvm/XRay/Graph.h
index 07b418b..8521e09 100644
--- a/llvm/include/llvm/XRay/Graph.h
+++ b/llvm/include/llvm/XRay/Graph.h
@@ -23,8 +23,7 @@
 #include "llvm/ADT/iterator.h"
 #include "llvm/Support/Error.h"
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 /// A Graph object represents a Directed Graph and is used in XRay to compute
 /// and store function call graphs and associated statistical information.
@@ -485,6 +484,6 @@ public:
     return p;
   }
 };
-}
-}
+} // namespace llvm::xray
+
 #endif
diff --git a/llvm/include/llvm/XRay/InstrumentationMap.h b/llvm/include/llvm/XRay/InstrumentationMap.h
index b5371478..c5e7ebf 100644
--- a/llvm/include/llvm/XRay/InstrumentationMap.h
+++ b/llvm/include/llvm/XRay/InstrumentationMap.h
@@ -23,9 +23,7 @@
 #include <unordered_map>
 #include <vector>
 
-namespace llvm {
-
-namespace xray {
+namespace llvm::xray {
 
 // Forward declare to make a friend.
 class InstrumentationMap;
@@ -102,11 +100,11 @@ public:
   const SledContainer &sleds() const { return Sleds; };
 };
 
-} // end namespace xray
-
-namespace yaml {
+} // end namespace llvm::xray
 
-template <> struct ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
+namespace llvm {
+template <>
+struct yaml::ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
   static void enumeration(IO &IO, xray::SledEntry::FunctionKinds &Kind) {
     IO.enumCase(Kind, "function-enter", xray::SledEntry::FunctionKinds::ENTRY);
     IO.enumCase(Kind, "function-exit", xray::SledEntry::FunctionKinds::EXIT);
@@ -118,7 +116,7 @@ template <> struct ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
   }
 };
 
-template <> struct MappingTraits<xray::YAMLXRaySledEntry> {
+template <> struct yaml::MappingTraits<xray::YAMLXRaySledEntry> {
   static void mapping(IO &IO, xray::YAMLXRaySledEntry &Entry) {
     IO.mapRequired("id", Entry.FuncId);
     IO.mapRequired("address", Entry.Address);
@@ -131,10 +129,7 @@ template <> struct MappingTraits<xray::YAMLXRaySledEntry> {
 
   static constexpr bool flow = true;
 };
-
-} // end namespace yaml
-
-} // end namespace llvm
+} // namespace llvm
 
 LLVM_YAML_IS_SEQUENCE_VECTOR(xray::YAMLXRaySledEntry)
 
diff --git a/llvm/include/llvm/XRay/Profile.h b/llvm/include/llvm/XRay/Profile.h
index e30c01e..b5b8dd2 100644
--- a/llvm/include/llvm/XRay/Profile.h
+++ b/llvm/include/llvm/XRay/Profile.h
@@ -22,8 +22,7 @@
 #include <utility>
 #include <vector>
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 class Profile;
 
@@ -144,7 +143,6 @@ public:
   bool empty() const { return Blocks.empty(); }
 };
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif
diff --git a/llvm/include/llvm/XRay/RecordPrinter.h b/llvm/include/llvm/XRay/RecordPrinter.h
index 5d2c277..3281221 100644
--- a/llvm/include/llvm/XRay/RecordPrinter.h
+++ b/llvm/include/llvm/XRay/RecordPrinter.h
@@ -17,8 +17,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/XRay/FDRRecords.h"
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 class LLVM_ABI RecordPrinter : public RecordVisitor {
   raw_ostream &OS;
@@ -44,7 +43,6 @@ public:
   Error visit(TypedEventRecord &) override;
 };
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_RECORDPRINTER_H
diff --git a/llvm/include/llvm/XRay/Trace.h b/llvm/include/llvm/XRay/Trace.h
index 5e4e40a..13ada22 100644
--- a/llvm/include/llvm/XRay/Trace.h
+++ b/llvm/include/llvm/XRay/Trace.h
@@ -21,8 +21,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/XRay/XRayRecord.h"
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 /// A Trace object represents the records that have been loaded from XRay
 /// log files generated by instrumented binaries. We encapsulate the logic of
@@ -76,7 +75,6 @@ LLVM_ABI Expected<Trace> loadTraceFile(StringRef Filename, bool Sort = false);
 LLVM_ABI Expected<Trace> loadTrace(const DataExtractor &Extractor,
                                    bool Sort = false);
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_TRACE_H
diff --git a/llvm/include/llvm/XRay/XRayRecord.h b/llvm/include/llvm/XRay/XRayRecord.h
index 238bf3d..8f3440c 100644
--- a/llvm/include/llvm/XRay/XRayRecord.h
+++ b/llvm/include/llvm/XRay/XRayRecord.h
@@ -18,8 +18,7 @@
 #include <vector>
 #include <string>
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 /// XRay traces all have a header providing some top-matter information useful
 /// to help tools determine how to interpret the information available in the
@@ -98,7 +97,6 @@ struct XRayRecord {
   std::string Data;
 };
 
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
 
 #endif // LLVM_XRAY_XRAYRECORD_H
diff --git a/llvm/include/llvm/XRay/YAMLXRayRecord.h b/llvm/include/llvm/XRay/YAMLXRayRecord.h
index 6062606..6bf4f1d 100644
--- a/llvm/include/llvm/XRay/YAMLXRayRecord.h
+++ b/llvm/include/llvm/XRay/YAMLXRayRecord.h
@@ -17,8 +17,7 @@
 #include "llvm/Support/YAMLTraits.h"
 #include "llvm/XRay/XRayRecord.h"
 
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
 
 struct YAMLXRayFileHeader {
   uint16_t Version;
@@ -46,13 +45,12 @@ struct YAMLXRayTrace {
   std::vector<YAMLXRayRecord> Records;
 };
 
-} // namespace xray
-
-namespace yaml {
+} // namespace llvm::xray
 
+namespace llvm {
 // YAML Traits
 // -----------
-template <> struct ScalarEnumerationTraits<xray::RecordTypes> {
+template <> struct yaml::ScalarEnumerationTraits<xray::RecordTypes> {
   static void enumeration(IO &IO, xray::RecordTypes &Type) {
     IO.enumCase(Type, "function-enter", xray::RecordTypes::ENTER);
     IO.enumCase(Type, "function-exit", xray::RecordTypes::EXIT);
@@ -63,7 +61,7 @@ template <> struct ScalarEnumerationTraits<xray::RecordTypes> {
   }
 };
 
-template <> struct MappingTraits<xray::YAMLXRayFileHeader> {
+template <> struct yaml::MappingTraits<xray::YAMLXRayFileHeader> {
   static void mapping(IO &IO, xray::YAMLXRayFileHeader &Header) {
     IO.mapRequired("version", Header.Version);
     IO.mapRequired("type", Header.Type);
@@ -73,7 +71,7 @@ template <> struct MappingTraits<xray::YAMLXRayFileHeader> {
   }
 };
 
-template <> struct MappingTraits<xray::YAMLXRayRecord> {
+template <> struct yaml::MappingTraits<xray::YAMLXRayRecord> {
   static void mapping(IO &IO, xray::YAMLXRayRecord &Record) {
     IO.mapRequired("type", Record.RecordType);
     IO.mapOptional("func-id", Record.FuncId);
@@ -90,7 +88,7 @@ template <> struct MappingTraits<xray::YAMLXRayRecord> {
   static constexpr bool flow = true;
 };
 
-template <> struct MappingTraits<xray::YAMLXRayTrace> {
+template <> struct yaml::MappingTraits<llvm::xray::YAMLXRayTrace> {
   static void mapping(IO &IO, xray::YAMLXRayTrace &Trace) {
     // A trace file contains two parts, the header and the list of all the
     // trace records.
@@ -98,8 +96,6 @@ template <> struct MappingTraits<xray::YAMLXRayTrace> {
     IO.mapRequired("records", Trace.Records);
   }
 };
-
-} // namespace yaml
 } // namespace llvm
 
 LLVM_YAML_IS_SEQUENCE_VECTOR(xray::YAMLXRayRecord)
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index b5b4cd9..00c3dbb 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -5419,20 +5419,15 @@ static Type *isSimpleCastedPHI(const SCEV *Op, const SCEVUnknown *SymbolicPHI,
   if (SourceBits != NewBits)
     return nullptr;
 
-  const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(Op);
-  const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(Op);
-  if (!SExt && !ZExt)
-    return nullptr;
-  const SCEVTruncateExpr *Trunc =
-      SExt ? dyn_cast<SCEVTruncateExpr>(SExt->getOperand())
-           : dyn_cast<SCEVTruncateExpr>(ZExt->getOperand());
-  if (!Trunc)
-    return nullptr;
-  const SCEV *X = Trunc->getOperand();
-  if (X != SymbolicPHI)
-    return nullptr;
-  Signed = SExt != nullptr;
-  return Trunc->getType();
+  if (match(Op, m_scev_SExt(m_scev_Trunc(m_scev_Specific(SymbolicPHI))))) {
+    Signed = true;
+    return cast<SCEVCastExpr>(Op)->getOperand()->getType();
+  }
+  if (match(Op, m_scev_ZExt(m_scev_Trunc(m_scev_Specific(SymbolicPHI))))) {
+    Signed = false;
+    return cast<SCEVCastExpr>(Op)->getOperand()->getType();
+  }
+  return nullptr;
 }
 
 static const Loop *isIntegerLoopHeaderPHI(const PHINode *PN, LoopInfo &LI) {
@@ -15428,20 +15423,18 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,
   // Try to match 'zext (trunc A to iB) to iY', which is used
   // for URem with constant power-of-2 second operands. Make sure the size of
   // the operand A matches the size of the whole expressions.
-  if (const auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(Expr))
-    if (const auto *Trunc = dyn_cast<SCEVTruncateExpr>(ZExt->getOperand(0))) {
-      LHS = Trunc->getOperand();
-      // Bail out if the type of the LHS is larger than the type of the
-      // expression for now.
-      if (getTypeSizeInBits(LHS->getType()) >
-          getTypeSizeInBits(Expr->getType()))
-        return false;
-      if (LHS->getType() != Expr->getType())
-        LHS = getZeroExtendExpr(LHS, Expr->getType());
-      RHS = getConstant(APInt(getTypeSizeInBits(Expr->getType()), 1)
-                        << getTypeSizeInBits(Trunc->getType()));
-      return true;
-    }
+  if (match(Expr, m_scev_ZExt(m_scev_Trunc(m_SCEV(LHS))))) {
+    Type *TruncTy = cast<SCEVZeroExtendExpr>(Expr)->getOperand()->getType();
+    // Bail out if the type of the LHS is larger than the type of the
+    // expression for now.
+    if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(Expr->getType()))
+      return false;
+    if (LHS->getType() != Expr->getType())
+      LHS = getZeroExtendExpr(LHS, Expr->getType());
+    RHS = getConstant(APInt(getTypeSizeInBits(Expr->getType()), 1)
+                      << getTypeSizeInBits(TruncTy));
+    return true;
+  }
   const auto *Add = dyn_cast<SCEVAddExpr>(Expr);
   if (Add == nullptr || Add->getNumOperands() != 2)
     return false;
diff --git a/llvm/lib/Analysis/StaticDataProfileInfo.cpp b/llvm/lib/Analysis/StaticDataProfileInfo.cpp
index b036b2d..1f751ee 100644
--- a/llvm/lib/Analysis/StaticDataProfileInfo.cpp
+++ b/llvm/lib/Analysis/StaticDataProfileInfo.cpp
@@ -6,6 +6,46 @@
 #include "llvm/ProfileData/InstrProf.h"
 
 using namespace llvm;
+
+namespace llvm {
+namespace memprof {
+// Returns true iff the global variable has custom section either by
+// __attribute__((section("name")))
+// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate)
+// or #pragma clang section directives
+// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section).
+static bool hasExplicitSectionName(const GlobalVariable &GVar) {
+  if (GVar.hasSection())
+    return true;
+
+  auto Attrs = GVar.getAttributes();
+  if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") ||
+      Attrs.hasAttribute("relro-section") ||
+      Attrs.hasAttribute("rodata-section"))
+    return true;
+  return false;
+}
+
+AnnotationKind getAnnotationKind(const GlobalVariable &GV) {
+  if (GV.isDeclarationForLinker())
+    return AnnotationKind::DeclForLinker;
+  // Skip 'llvm.'-prefixed global variables conservatively because they are
+  // often handled specially,
+  StringRef Name = GV.getName();
+  if (Name.starts_with("llvm."))
+    return AnnotationKind::ReservedName;
+  // Respect user-specified custom data sections.
+  if (hasExplicitSectionName(GV))
+    return AnnotationKind::ExplicitSection;
+  return AnnotationKind::AnnotationOK;
+}
+
+bool IsAnnotationOK(const GlobalVariable &GV) {
+  return getAnnotationKind(GV) == AnnotationKind::AnnotationOK;
+}
+} // namespace memprof
+} // namespace llvm
+
 void StaticDataProfileInfo::addConstantProfileCount(
     const Constant *C, std::optional<uint64_t> Count) {
   if (!Count) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
index 6356d71..873ac8f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
@@ -20,7 +20,7 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 
-namespace llvm {
+using namespace llvm;
 
 AIXException::AIXException(AsmPrinter *A) : EHStreamer(A) {}
 
@@ -90,5 +90,3 @@ void AIXException::endFunction(const MachineFunction *MF) {
 
   emitExceptionInfoTable(LSDALabel, PerSym);
 }
-
-} // End of namespace llvm
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index 260ce8f..93ae548 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -85,8 +85,7 @@ template <> struct llvm::DenseMapInfo<VariableID> {
 
 using VarLocInsertPt = PointerUnion<const Instruction *, const DbgRecord *>;
 
-namespace std {
-template <> struct hash<VarLocInsertPt> {
+template <> struct std::hash<VarLocInsertPt> {
   using argument_type = VarLocInsertPt;
   using result_type = std::size_t;
 
@@ -94,7 +93,6 @@ template <> struct hash<VarLocInsertPt> {
     return std::hash<void *>()(Arg.getOpaqueValue());
   }
 };
-} // namespace std
 
 /// Helper class to build FunctionVarLocs, since that class isn't easy to
 /// modify. TODO: There's not a great deal of value in the split, it could be
diff --git a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
index fd7df6b..47b7a88 100644
--- a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
+++ b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
@@ -207,9 +207,7 @@ bool ApplyCloning(MachineFunction &MF,
   }
   return AnyPathsCloned;
 }
-} // end anonymous namespace
 
-namespace llvm {
 class BasicBlockPathCloning : public MachineFunctionPass {
 public:
   static char ID;
@@ -229,7 +227,7 @@ public:
   bool runOnMachineFunction(MachineFunction &MF) override;
 };
 
-} // namespace llvm
+} // namespace
 
 char BasicBlockPathCloning::ID = 0;
 INITIALIZE_PASS_BEGIN(
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp
index 28e6728..1846880 100644
--- a/llvm/lib/CodeGen/BreakFalseDeps.cpp
+++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -31,7 +31,7 @@
 
 using namespace llvm;
 
-namespace llvm {
+namespace {
 
 class BreakFalseDeps : public MachineFunctionPass {
 private:
@@ -95,7 +95,7 @@ private:
   void processUndefReads(MachineBasicBlock *);
 };
 
-} // namespace llvm
+} // namespace
 
 #define DEBUG_TYPE "break-false-deps"
 
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 6c2a5a7..87ada87 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -126,8 +126,7 @@ hash_code hash_value(const ComplexValue &Arg) {
 } // end namespace
 typedef SmallVector<struct ComplexValue, 2> ComplexValues;
 
-namespace llvm {
-template <> struct DenseMapInfo<ComplexValue> {
+template <> struct llvm::DenseMapInfo<ComplexValue> {
   static inline ComplexValue getEmptyKey() {
     return {DenseMapInfo<Value *>::getEmptyKey(),
             DenseMapInfo<Value *>::getEmptyKey()};
@@ -144,7 +143,6 @@ template <> struct DenseMapInfo<ComplexValue> {
     return LHS.Real == RHS.Real && LHS.Imag == RHS.Imag;
   }
 };
-} // end namespace llvm
 
 namespace {
 template <typename T, typename IterT>
diff --git a/llvm/lib/CodeGen/EdgeBundles.cpp b/llvm/lib/CodeGen/EdgeBundles.cpp
index f4335396..50dd66f 100644
--- a/llvm/lib/CodeGen/EdgeBundles.cpp
+++ b/llvm/lib/CodeGen/EdgeBundles.cpp
@@ -81,13 +81,10 @@ void EdgeBundles::init() {
   }
 }
 
-namespace llvm {
-
 /// Specialize WriteGraph, the standard implementation won't work.
-template<>
-raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
-                          bool ShortNames,
-                          const Twine &Title) {
+template <>
+raw_ostream &llvm::WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
+                                bool ShortNames, const Twine &Title) {
   const MachineFunction *MF = G.getMachineFunction();
 
   O << "digraph {\n";
@@ -107,8 +104,6 @@ raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
   return O;
 }
 
-} // end namespace llvm
-
 /// view - Visualize the annotated bipartite CFG with Graphviz.
 void EdgeBundles::view() const {
   ViewGraph(*this, "EdgeBundles");
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index c500357..04c7008 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -1036,6 +1036,7 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
         continue;
 
       addToWorklist(I, Worklist);
+      Modified = true;
       break;
     }
     default:
diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
index 47640c4a..81ab317 100644
--- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
+++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
@@ -587,16 +587,12 @@ public:
 } // namespace
 
 char GlobalMergeFuncPassWrapper::ID = 0;
-INITIALIZE_PASS_BEGIN(GlobalMergeFuncPassWrapper, "global-merge-func",
-                      "Global merge function pass", false, false)
-INITIALIZE_PASS_END(GlobalMergeFuncPassWrapper, "global-merge-func",
-                    "Global merge function pass", false, false)
+INITIALIZE_PASS(GlobalMergeFuncPassWrapper, "global-merge-func",
+                "Global merge function pass", false, false)
 
-namespace llvm {
-ModulePass *createGlobalMergeFuncPass() {
+ModulePass *llvm::createGlobalMergeFuncPass() {
   return new GlobalMergeFuncPassWrapper();
 }
-} // namespace llvm
 
 GlobalMergeFuncPassWrapper::GlobalMergeFuncPassWrapper() : ModulePass(ID) {
   initializeGlobalMergeFuncPassWrapperPass(
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 3485a27..0e38017 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -101,15 +101,11 @@ static cl::opt<bool> EnablePrecomputePhysRegs(
 static bool EnablePrecomputePhysRegs = false;
 #endif // NDEBUG
 
-namespace llvm {
-
-cl::opt<bool> UseSegmentSetForPhysRegs(
+cl::opt<bool> llvm::UseSegmentSetForPhysRegs(
     "use-segment-set-for-physregs", cl::Hidden, cl::init(true),
     cl::desc(
         "Use segment set for the computation of the live ranges of physregs."));
 
-} // end namespace llvm
-
 void LiveIntervalsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
   AU.addPreserved<LiveVariablesWrapperPass>();
diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp
index e859765..5c78d98 100644
--- a/llvm/lib/CodeGen/MIR2Vec.cpp
+++ b/llvm/lib/CodeGen/MIR2Vec.cpp
@@ -29,20 +29,17 @@ using namespace mir2vec;
 STATISTIC(MIRVocabMissCounter,
           "Number of lookups to MIR entities not present in the vocabulary");
 
-namespace llvm {
-namespace mir2vec {
-cl::OptionCategory MIR2VecCategory("MIR2Vec Options");
+cl::OptionCategory llvm::mir2vec::MIR2VecCategory("MIR2Vec Options");
 
 // FIXME: Use a default vocab when not specified
 static cl::opt<std::string>
     VocabFile("mir2vec-vocab-path", cl::Optional,
               cl::desc("Path to the vocabulary file for MIR2Vec"), cl::init(""),
               cl::cat(MIR2VecCategory));
-cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
-                         cl::desc("Weight for machine opcode embeddings"),
-                         cl::cat(MIR2VecCategory));
-} // namespace mir2vec
-} // namespace llvm
+cl::opt<float>
+    llvm::mir2vec::OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
+                             cl::desc("Weight for machine opcode embeddings"),
+                             cl::cat(MIR2VecCategory));
 
 //===----------------------------------------------------------------------===//
 // Vocabulary Implementation
diff --git a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
index f5146f5..d988a2a 100644
--- a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
+++ b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
@@ -40,7 +40,7 @@ cl::opt<bool> ImprovedFSDiscriminator(
     "improved-fs-discriminator", cl::Hidden, cl::init(false),
     cl::desc("New FS discriminators encoding (incompatible with the original "
              "encoding)"));
-}
+} // namespace llvm
 
 char MIRAddFSDiscriminators::ID = 0;
 
diff --git a/llvm/lib/CodeGen/MIRNamerPass.cpp b/llvm/lib/CodeGen/MIRNamerPass.cpp
index bc65700..cbf8867 100644
--- a/llvm/lib/CodeGen/MIRNamerPass.cpp
+++ b/llvm/lib/CodeGen/MIRNamerPass.cpp
@@ -23,10 +23,6 @@
 
 using namespace llvm;
 
-namespace llvm {
-extern char &MIRNamerID;
-} // namespace llvm
-
 #define DEBUG_TYPE "mir-namer"
 
 namespace {
@@ -53,10 +49,9 @@ public:
 
     VRegRenamer Renamer(MF.getRegInfo());
 
-    unsigned BBIndex = 0;
     ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
-    for (auto &MBB : RPOT)
-      Changed |= Renamer.renameVRegs(MBB, BBIndex++);
+    for (const auto &[BBIndex, MBB] : enumerate(RPOT))
+      Changed |= Renamer.renameVRegs(MBB, BBIndex);
 
     return Changed;
   }
@@ -66,10 +61,4 @@ public:
 
 char MIRNamer::ID;
 
-char &llvm::MIRNamerID = MIRNamer::ID;
-
-INITIALIZE_PASS_BEGIN(MIRNamer, "mir-namer", "Rename Register Operands", false,
-                      false)
-
-INITIALIZE_PASS_END(MIRNamer, "mir-namer", "Rename Register Operands", false,
-                    false)
+INITIALIZE_PASS(MIRNamer, "mir-namer", "Rename Register Operands", false, false)
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index bf8a6cd..96428cd 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -107,10 +107,8 @@ struct MFPrintState {
 
 } // end anonymous namespace
 
-namespace llvm::yaml {
-
 /// This struct serializes the LLVM IR module.
-template <> struct BlockScalarTraits<Module> {
+template <> struct yaml::BlockScalarTraits<Module> {
   static void output(const Module &Mod, void *Ctxt, raw_ostream &OS) {
     Mod.print(OS, nullptr);
   }
@@ -121,8 +119,6 @@ template <> struct BlockScalarTraits<Module> {
   }
 };
 
-} // end namespace llvm::yaml
-
 static void printRegMIR(Register Reg, yaml::StringValue &Dest,
                         const TargetRegisterInfo *TRI) {
   raw_string_ostream OS(Dest.Value);
diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
index b2731b69..a72c2c4 100644
--- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
+++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
@@ -97,7 +97,9 @@ static const bool EnableDevelopmentFeatures = false;
 /// this happens only in development mode. It's a no-op otherwise.
 namespace llvm {
 extern cl::opt<unsigned> EvictInterferenceCutoff;
+} // namespace llvm
 
+namespace {
 class RegAllocScoring : public MachineFunctionPass {
 public:
   static char ID;
@@ -124,11 +126,12 @@ public:
   /// Performs this pass
   bool runOnMachineFunction(MachineFunction &) override;
 };
+} // namespace
 
 char RegAllocScoring::ID = 0;
-FunctionPass *createRegAllocScoringPass() { return new RegAllocScoring(); }
-
-} // namespace llvm
+FunctionPass *llvm::createRegAllocScoringPass() {
+  return new RegAllocScoring();
+}
 
 INITIALIZE_PASS(RegAllocScoring, "regallocscoringpass",
                 "Register Allocation Scoring Pass", false, false)
diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index e7fa082..26eb10f 100644
--- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -29,7 +29,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "machine-block-freq"
 
-namespace llvm {
 static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
     "view-machine-block-freq-propagation-dags", cl::Hidden,
     cl::desc("Pop up a window to show a dag displaying how machine block "
@@ -44,6 +43,7 @@ static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
                clEnumValN(GVDT_Count, "count", "display a graph using the real "
                                                "profile count if available.")));
 
+namespace llvm {
 // Similar option above, but used to control BFI display only after MBP pass
 cl::opt<GVDAGType> ViewBlockLayoutWithBFI(
     "view-block-layout-with-bfi", cl::Hidden,
@@ -69,15 +69,15 @@ extern cl::opt<std::string> ViewBlockFreqFuncName;
 // Defined in Analysis/BlockFrequencyInfo.cpp:  -view-hot-freq-perc=
 extern cl::opt<unsigned> ViewHotFreqPercent;
 
-static cl::opt<bool> PrintMachineBlockFreq(
-    "print-machine-bfi", cl::init(false), cl::Hidden,
-    cl::desc("Print the machine block frequency info."));
-
 // Command line option to specify the name of the function for block frequency
 // dump. Defined in Analysis/BlockFrequencyInfo.cpp.
 extern cl::opt<std::string> PrintBFIFuncName;
 } // namespace llvm
 
+static cl::opt<bool>
+    PrintMachineBlockFreq("print-machine-bfi", cl::init(false), cl::Hidden,
+                          cl::desc("Print the machine block frequency info."));
+
 static GVDAGType getGVDT() {
   if (ViewBlockLayoutWithBFI != GVDT_None)
     return ViewBlockLayoutWithBFI;
@@ -85,9 +85,7 @@ static GVDAGType getGVDT() {
   return ViewMachineBlockFreqPropagationDAG;
 }
 
-namespace llvm {
-
-template <> struct GraphTraits<MachineBlockFrequencyInfo *> {
+template <> struct llvm::GraphTraits<MachineBlockFrequencyInfo *> {
   using NodeRef = const MachineBasicBlock *;
   using ChildIteratorType = MachineBasicBlock::const_succ_iterator;
   using nodes_iterator = pointer_iterator<MachineFunction::const_iterator>;
@@ -116,7 +114,7 @@ using MBFIDOTGraphTraitsBase =
                           MachineBranchProbabilityInfo>;
 
 template <>
-struct DOTGraphTraits<MachineBlockFrequencyInfo *>
+struct llvm::DOTGraphTraits<MachineBlockFrequencyInfo *>
     : public MBFIDOTGraphTraitsBase {
   const MachineFunction *CurFunc = nullptr;
   DenseMap<const MachineBasicBlock *, int> LayoutOrderMap;
@@ -159,8 +157,6 @@ struct DOTGraphTraits<MachineBlockFrequencyInfo *>
   }
 };
 
-} // end namespace llvm
-
 AnalysisKey MachineBlockFrequencyAnalysis::Key;
 
 MachineBlockFrequencyAnalysis::Result
diff --git a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
index 2e92dd8..7ca4582 100644
--- a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -18,13 +18,8 @@
 
 using namespace llvm;
 
-INITIALIZE_PASS_BEGIN(MachineBranchProbabilityInfoWrapperPass,
-                      "machine-branch-prob",
-                      "Machine Branch Probability Analysis", false, true)
-INITIALIZE_PASS_END(MachineBranchProbabilityInfoWrapperPass,
-                    "machine-branch-prob",
-                    "Machine Branch Probability Analysis", false, true)
-
+INITIALIZE_PASS(MachineBranchProbabilityInfoWrapperPass, "machine-branch-prob",
+                "Machine Branch Probability Analysis", false, true)
 namespace llvm {
 cl::opt<unsigned>
     StaticLikelyProb("static-likely-prob",
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 224231c..bfa5ab2 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -719,43 +719,41 @@ MachineFunction::CallSiteInfo::CallSiteInfo(const CallBase &CB) {
   }
 }
 
-namespace llvm {
+template <>
+struct llvm::DOTGraphTraits<const MachineFunction *>
+    : public DefaultDOTGraphTraits {
+  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
 
-  template<>
-  struct DOTGraphTraits<const MachineFunction*> : public DefaultDOTGraphTraits {
-    DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+  static std::string getGraphName(const MachineFunction *F) {
+    return ("CFG for '" + F->getName() + "' function").str();
+  }
 
-    static std::string getGraphName(const MachineFunction *F) {
-      return ("CFG for '" + F->getName() + "' function").str();
+  std::string getNodeLabel(const MachineBasicBlock *Node,
+                           const MachineFunction *Graph) {
+    std::string OutStr;
+    {
+      raw_string_ostream OSS(OutStr);
+
+      if (isSimple()) {
+        OSS << printMBBReference(*Node);
+        if (const BasicBlock *BB = Node->getBasicBlock())
+          OSS << ": " << BB->getName();
+      } else
+        Node->print(OSS);
     }
 
-    std::string getNodeLabel(const MachineBasicBlock *Node,
-                             const MachineFunction *Graph) {
-      std::string OutStr;
-      {
-        raw_string_ostream OSS(OutStr);
-
-        if (isSimple()) {
-          OSS << printMBBReference(*Node);
-          if (const BasicBlock *BB = Node->getBasicBlock())
-            OSS << ": " << BB->getName();
-        } else
-          Node->print(OSS);
-      }
-
-      if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
-
-      // Process string output to make it nicer...
-      for (unsigned i = 0; i != OutStr.length(); ++i)
-        if (OutStr[i] == '\n') {                            // Left justify
-          OutStr[i] = '\\';
-          OutStr.insert(OutStr.begin()+i+1, 'l');
-        }
-      return OutStr;
-    }
-  };
+    if (OutStr[0] == '\n')
+      OutStr.erase(OutStr.begin());
 
-} // end namespace llvm
+    // Process string output to make it nicer...
+    for (unsigned i = 0; i != OutStr.length(); ++i)
+      if (OutStr[i] == '\n') { // Left justify
+        OutStr[i] = '\\';
+        OutStr.insert(OutStr.begin() + i + 1, 'l');
+      }
+    return OutStr;
+  }
+};
 
 void MachineFunction::viewCFG() const
 {
diff --git a/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp b/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
index 0f88a7b..5111322 100644
--- a/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -60,13 +60,11 @@ char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
 INITIALIZE_PASS(MachineFunctionPrinterPass, "machineinstr-printer",
                 "Machine Function Printer", false, false)
 
-namespace llvm {
 /// Returns a newly-created MachineFunction Printer pass. The
 /// default banner is empty.
 ///
-MachineFunctionPass *createMachineFunctionPrinterPass(raw_ostream &OS,
-                                                      const std::string &Banner){
+MachineFunctionPass *
+llvm::createMachineFunctionPrinterPass(raw_ostream &OS,
+                                       const std::string &Banner) {
   return new MachineFunctionPrinterPass(OS, Banner);
 }
-
-}
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index fdae3b4..9feb974 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -593,15 +593,12 @@ struct MachineOutliner : public ModulePass {
 
 char MachineOutliner::ID = 0;
 
-namespace llvm {
-ModulePass *createMachineOutlinerPass(RunOutliner RunOutlinerMode) {
+ModulePass *llvm::createMachineOutlinerPass(RunOutliner RunOutlinerMode) {
   MachineOutliner *OL = new MachineOutliner();
   OL->RunOutlinerMode = RunOutlinerMode;
   return OL;
 }
 
-} // namespace llvm
-
 INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE, "Machine Function Outliner", false,
                 false)
 
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 89ed4da..a717d9e 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -201,16 +201,15 @@ static cl::opt<unsigned> SwpMaxNumStores(
     cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden,
     cl::init(200));
 
-namespace llvm {
-
 // A command line option to enable the CopyToPhi DAG mutation.
-cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
-                                 cl::init(true),
-                                 cl::desc("Enable CopyToPhi DAG Mutation"));
+cl::opt<bool>
+    llvm::SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
+                             cl::init(true),
+                             cl::desc("Enable CopyToPhi DAG Mutation"));
 
 /// A command line argument to force pipeliner to use specified issue
 /// width.
-cl::opt<int> SwpForceIssueWidth(
+cl::opt<int> llvm::SwpForceIssueWidth(
     "pipeliner-force-issue-width",
     cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
     cl::init(-1));
@@ -226,8 +225,6 @@ static cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
                clEnumValN(WindowSchedulingFlag::WS_Force, "force",
                           "Use window algorithm instead of SMS algorithm.")));
 
-} // end namespace llvm
-
 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
 char MachinePipeliner::ID = 0;
 #ifndef NDEBUG
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 299bcc4..3ed1045 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -176,9 +176,7 @@ STATISTIC(NumNodeOrderPostRA,
 STATISTIC(NumFirstValidPostRA,
           "Number of scheduling units chosen for FirstValid heuristic post-RA");
 
-namespace llvm {
-
-cl::opt<MISched::Direction> PreRADirection(
+cl::opt<MISched::Direction> llvm::PreRADirection(
     "misched-prera-direction", cl::Hidden,
     cl::desc("Pre reg-alloc list scheduling direction"),
     cl::init(MISched::Unspecified),
@@ -206,33 +204,31 @@ static cl::opt<bool>
     DumpCriticalPathLength("misched-dcpl", cl::Hidden,
                            cl::desc("Print critical path length to stdout"));
 
-cl::opt<bool> VerifyScheduling(
+cl::opt<bool> llvm::VerifyScheduling(
     "verify-misched", cl::Hidden,
     cl::desc("Verify machine instrs before and after machine scheduling"));
 
 #ifndef NDEBUG
-cl::opt<bool> ViewMISchedDAGs(
+cl::opt<bool> llvm::ViewMISchedDAGs(
     "view-misched-dags", cl::Hidden,
     cl::desc("Pop up a window to show MISched dags after they are processed"));
-cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden,
-                        cl::desc("Print schedule DAGs"));
-cl::opt<bool> MISchedDumpReservedCycles(
+cl::opt<bool> llvm::PrintDAGs("misched-print-dags", cl::Hidden,
+                              cl::desc("Print schedule DAGs"));
+static cl::opt<bool> MISchedDumpReservedCycles(
     "misched-dump-reserved-cycles", cl::Hidden, cl::init(false),
     cl::desc("Dump resource usage at schedule boundary."));
-cl::opt<bool> MischedDetailResourceBooking(
+static cl::opt<bool> MischedDetailResourceBooking(
     "misched-detail-resource-booking", cl::Hidden, cl::init(false),
     cl::desc("Show details of invoking getNextResoufceCycle."));
 #else
-const bool ViewMISchedDAGs = false;
-const bool PrintDAGs = false;
-const bool MischedDetailResourceBooking = false;
+const bool llvm::ViewMISchedDAGs = false;
+const bool llvm::PrintDAGs = false;
+static const bool MischedDetailResourceBooking = false;
 #ifdef LLVM_ENABLE_DUMP
-const bool MISchedDumpReservedCycles = false;
+static const bool MISchedDumpReservedCycles = false;
 #endif // LLVM_ENABLE_DUMP
 #endif // NDEBUG
 
-} // end namespace llvm
-
 #ifndef NDEBUG
 /// In some situations a few uninteresting nodes depend on nearly all other
 /// nodes in the graph, provide a cutoff to hide them.
@@ -2053,28 +2049,24 @@ public:
 
 } // end anonymous namespace
 
-namespace llvm {
-
 std::unique_ptr<ScheduleDAGMutation>
-createLoadClusterDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI,
-                             bool ReorderWhileClustering) {
+llvm::createLoadClusterDAGMutation(const TargetInstrInfo *TII,
+                                   const TargetRegisterInfo *TRI,
+                                   bool ReorderWhileClustering) {
   return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(
                                   TII, TRI, ReorderWhileClustering)
                             : nullptr;
 }
 
 std::unique_ptr<ScheduleDAGMutation>
-createStoreClusterDAGMutation(const TargetInstrInfo *TII,
-                              const TargetRegisterInfo *TRI,
-                              bool ReorderWhileClustering) {
+llvm::createStoreClusterDAGMutation(const TargetInstrInfo *TII,
+                                    const TargetRegisterInfo *TRI,
+                                    bool ReorderWhileClustering) {
   return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(
                                   TII, TRI, ReorderWhileClustering)
                             : nullptr;
 }
 
-} // end namespace llvm
-
 // Sorting all the loads/stores first, then for each load/store, checking the
 // following load/store one by one, until reach the first non-dependent one and
 // call target hook to see if they can cluster.
@@ -2304,16 +2296,12 @@ protected:
 
 } // end anonymous namespace
 
-namespace llvm {
-
 std::unique_ptr<ScheduleDAGMutation>
-createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
-                               const TargetRegisterInfo *TRI) {
+llvm::createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
+                                     const TargetRegisterInfo *TRI) {
   return std::make_unique<CopyConstrain>(TII, TRI);
 }
 
-} // end namespace llvm
-
 /// constrainLocalCopy handles two possibilities:
 /// 1) Local src:
 /// I0:     = dst
@@ -3445,14 +3433,13 @@ void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
 }
 #endif
 
-namespace llvm {
 /// Return true if this heuristic determines order.
 /// TODO: Consider refactor return type of these functions as integer or enum,
 /// as we may need to differentiate whether TryCand is better than Cand.
-bool tryLess(int TryVal, int CandVal,
-             GenericSchedulerBase::SchedCandidate &TryCand,
-             GenericSchedulerBase::SchedCandidate &Cand,
-             GenericSchedulerBase::CandReason Reason) {
+bool llvm::tryLess(int TryVal, int CandVal,
+                   GenericSchedulerBase::SchedCandidate &TryCand,
+                   GenericSchedulerBase::SchedCandidate &Cand,
+                   GenericSchedulerBase::CandReason Reason) {
   if (TryVal < CandVal) {
     TryCand.Reason = Reason;
     return true;
@@ -3465,10 +3452,10 @@ bool tryLess(int TryVal, int CandVal,
   return false;
 }
 
-bool tryGreater(int TryVal, int CandVal,
-                GenericSchedulerBase::SchedCandidate &TryCand,
-                GenericSchedulerBase::SchedCandidate &Cand,
-                GenericSchedulerBase::CandReason Reason) {
+bool llvm::tryGreater(int TryVal, int CandVal,
+                      GenericSchedulerBase::SchedCandidate &TryCand,
+                      GenericSchedulerBase::SchedCandidate &Cand,
+                      GenericSchedulerBase::CandReason Reason) {
   if (TryVal > CandVal) {
     TryCand.Reason = Reason;
     return true;
@@ -3481,9 +3468,9 @@ bool tryGreater(int TryVal, int CandVal,
   return false;
 }
 
-bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
-                GenericSchedulerBase::SchedCandidate &Cand,
-                SchedBoundary &Zone) {
+bool llvm::tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
+                      GenericSchedulerBase::SchedCandidate &Cand,
+                      SchedBoundary &Zone) {
   if (Zone.isTop()) {
     // Prefer the candidate with the lesser depth, but only if one of them has
     // depth greater than the total latency scheduled so far, otherwise either
@@ -3513,7 +3500,6 @@ bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
   }
   return false;
 }
-} // end namespace llvm
 
 static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop,
                       bool IsPostRA = false) {
@@ -3798,14 +3784,12 @@ void GenericScheduler::registerRoots() {
   }
 }
 
-namespace llvm {
-bool tryPressure(const PressureChange &TryP,
-                 const PressureChange &CandP,
-                 GenericSchedulerBase::SchedCandidate &TryCand,
-                 GenericSchedulerBase::SchedCandidate &Cand,
-                 GenericSchedulerBase::CandReason Reason,
-                 const TargetRegisterInfo *TRI,
-                 const MachineFunction &MF) {
+bool llvm::tryPressure(const PressureChange &TryP, const PressureChange &CandP,
+                       GenericSchedulerBase::SchedCandidate &TryCand,
+                       GenericSchedulerBase::SchedCandidate &Cand,
+                       GenericSchedulerBase::CandReason Reason,
+                       const TargetRegisterInfo *TRI,
+                       const MachineFunction &MF) {
   // If one candidate decreases and the other increases, go with it.
   // Invalid candidates have UnitInc==0.
   if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
@@ -3838,7 +3822,7 @@ bool tryPressure(const PressureChange &TryP,
   return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
 }
 
-unsigned getWeakLeft(const SUnit *SU, bool isTop) {
+unsigned llvm::getWeakLeft(const SUnit *SU, bool isTop) {
   return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
 }
 
@@ -3849,7 +3833,7 @@ unsigned getWeakLeft(const SUnit *SU, bool isTop) {
 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
 /// with the operation that produces or consumes the physreg. We'll do this when
 /// regalloc has support for parallel copies.
-int biasPhysReg(const SUnit *SU, bool isTop) {
+int llvm::biasPhysReg(const SUnit *SU, bool isTop) {
   const MachineInstr *MI = SU->getInstr();
 
   if (MI->isCopy()) {
@@ -3884,7 +3868,6 @@ int biasPhysReg(const SUnit *SU, bool isTop) {
 
   return 0;
 }
-} // end namespace llvm
 
 void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                      bool AtTop,
@@ -4812,13 +4795,13 @@ static MachineSchedRegistry ShufflerRegistry(
 //===----------------------------------------------------------------------===//
 
 #ifndef NDEBUG
-namespace llvm {
 
-template<> struct GraphTraits<
-  ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
+template <>
+struct llvm::GraphTraits<ScheduleDAGMI *> : public GraphTraits<ScheduleDAG *> {
+};
 
-template<>
-struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
+template <>
+struct llvm::DOTGraphTraits<ScheduleDAGMI *> : public DefaultDOTGraphTraits {
   DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
 
   static std::string getGraphName(const ScheduleDAG *G) {
@@ -4878,7 +4861,6 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
   }
 };
 
-} // end namespace llvm
 #endif // NDEBUG
 
 /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index c2d4aa0..9ac3f741 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -485,10 +485,7 @@ struct LoopBounds {
 
 // Specialize po_iterator_storage in order to prune the post-order traversal so
 // it is limited to the current loop and doesn't traverse the loop back edges.
-namespace llvm {
-
-template<>
-class po_iterator_storage<LoopBounds, true> {
+template <> class llvm::po_iterator_storage<LoopBounds, true> {
   LoopBounds &LB;
 
 public:
@@ -519,8 +516,6 @@ public:
   }
 };
 
-} // end namespace llvm
-
 /// Compute the trace through MBB.
 void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
   LLVM_DEBUG(dbgs() << "Computing " << getName() << " trace through "
diff --git a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
index 087ac62..59c587c 100644
--- a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
+++ b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
@@ -9,7 +9,7 @@
 #include "llvm/CodeGen/NonRelocatableStringpool.h"
 #include "llvm/ADT/STLExtras.h"
 
-namespace llvm {
+using namespace llvm;
 
 DwarfStringPoolEntryRef NonRelocatableStringpool::getEntry(StringRef S) {
   auto I = Strings.try_emplace(S);
@@ -43,5 +43,3 @@ NonRelocatableStringpool::getEntriesForEmission() const {
   });
   return Result;
 }
-
-} // namespace llvm
diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index 6f373a5..e9ffa85 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -76,8 +76,6 @@ using namespace llvm::safestack;
 
 #define DEBUG_TYPE "safe-stack"
 
-namespace llvm {
-
 STATISTIC(NumFunctions, "Total number of functions");
 STATISTIC(NumUnsafeStackFunctions, "Number of functions with unsafe stack");
 STATISTIC(NumUnsafeStackRestorePointsFunctions,
@@ -89,8 +87,6 @@ STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas");
 STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments");
 STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads");
 
-} // namespace llvm
-
 /// Use __safestack_pointer_address even if the platform has a faster way of
 /// access safe stack pointer.
 static cl::opt<bool>
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index eae2e8c..3268c26 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -1551,14 +1551,10 @@ LLVM_DUMP_METHOD void ILPValue::dump() const {
   dbgs() << *this << '\n';
 }
 
-namespace llvm {
-
 LLVM_ATTRIBUTE_UNUSED
-raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
+raw_ostream &llvm::operator<<(raw_ostream &OS, const ILPValue &Val) {
   Val.print(OS);
   return OS;
 }
 
-} // end namespace llvm
-
 #endif
diff --git a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
index e7b1494..c80eade 100644
--- a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -16,57 +16,51 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
-namespace llvm {
-  template<>
-  struct DOTGraphTraits<ScheduleDAG*> : public DefaultDOTGraphTraits {
+template <>
+struct llvm::DOTGraphTraits<ScheduleDAG *> : public DefaultDOTGraphTraits {
 
-  DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
 
-    static std::string getGraphName(const ScheduleDAG *G) {
-      return std::string(G->MF.getName());
-    }
+  static std::string getGraphName(const ScheduleDAG *G) {
+    return std::string(G->MF.getName());
+  }
 
-    static bool renderGraphFromBottomUp() {
-      return true;
-    }
+  static bool renderGraphFromBottomUp() { return true; }
 
-    static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
-      return (Node->NumPreds > 10 || Node->NumSuccs > 10);
-    }
+  static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
+    return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+  }
 
-    static std::string getNodeIdentifierLabel(const SUnit *Node,
-                                              const ScheduleDAG *Graph) {
-      std::string R;
-      raw_string_ostream OS(R);
-      OS << static_cast<const void *>(Node);
-      return R;
-    }
+  static std::string getNodeIdentifierLabel(const SUnit *Node,
+                                            const ScheduleDAG *Graph) {
+    std::string R;
+    raw_string_ostream OS(R);
+    OS << static_cast<const void *>(Node);
+    return R;
+  }
 
-    /// If you want to override the dot attributes printed for a particular
-    /// edge, override this method.
-    static std::string getEdgeAttributes(const SUnit *Node,
-                                         SUnitIterator EI,
-                                         const ScheduleDAG *Graph) {
-      if (EI.isArtificialDep())
-        return "color=cyan,style=dashed";
-      if (EI.isCtrlDep())
-        return "color=blue,style=dashed";
-      return "";
-    }
+  /// If you want to override the dot attributes printed for a particular
+  /// edge, override this method.
+  static std::string getEdgeAttributes(const SUnit *Node, SUnitIterator EI,
+                                       const ScheduleDAG *Graph) {
+    if (EI.isArtificialDep())
+      return "color=cyan,style=dashed";
+    if (EI.isCtrlDep())
+      return "color=blue,style=dashed";
+    return "";
+  }
 
+  std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *Graph);
+  static std::string getNodeAttributes(const SUnit *N,
+                                       const ScheduleDAG *Graph) {
+    return "shape=Mrecord";
+  }
 
-    std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *Graph);
-    static std::string getNodeAttributes(const SUnit *N,
-                                         const ScheduleDAG *Graph) {
-      return "shape=Mrecord";
-    }
-
-    static void addCustomGraphFeatures(ScheduleDAG *G,
-                                       GraphWriter<ScheduleDAG*> &GW) {
-      return G->addCustomGraphFeatures(GW);
-    }
-  };
-}
+  static void addCustomGraphFeatures(ScheduleDAG *G,
+                                     GraphWriter<ScheduleDAG *> &GW) {
+    return G->addCustomGraphFeatures(GW);
+  }
+};
 
 std::string DOTGraphTraits<ScheduleDAG*>::getNodeLabel(const SUnit *SU,
                                                        const ScheduleDAG *G) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b1accdd..e153842 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -509,6 +509,7 @@ namespace {
     SDValue visitFMUL(SDNode *N);
     template <class MatchContextClass> SDValue visitFMA(SDNode *N);
     SDValue visitFMAD(SDNode *N);
+    SDValue visitFMULADD(SDNode *N);
     SDValue visitFDIV(SDNode *N);
     SDValue visitFREM(SDNode *N);
     SDValue visitFSQRT(SDNode *N);
@@ -1991,6 +1992,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::FMUL:               return visitFMUL(N);
   case ISD::FMA:                return visitFMA<EmptyMatchContext>(N);
   case ISD::FMAD:               return visitFMAD(N);
+  case ISD::FMULADD:            return visitFMULADD(N);
   case ISD::FDIV:               return visitFDIV(N);
   case ISD::FREM:               return visitFREM(N);
   case ISD::FSQRT:              return visitFSQRT(N);
@@ -18444,6 +18446,21 @@ SDValue DAGCombiner::visitFMAD(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitFMULADD(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Constant fold FMULADD.
+  if (SDValue C =
+          DAG.FoldConstantArithmetic(ISD::FMULADD, DL, VT, {N0, N1, N2}))
+    return C;
+
+  return SDValue();
+}
+
 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
 // reciprocal.
 // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2b8dd60..4512c5c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5786,6 +5786,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   case ISD::FCOPYSIGN:
   case ISD::FMA:
   case ISD::FMAD:
+  case ISD::FMULADD:
   case ISD::FP_EXTEND:
   case ISD::FP_TO_SINT_SAT:
   case ISD::FP_TO_UINT_SAT:
@@ -5904,6 +5905,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
   case ISD::FCOSH:
   case ISD::FTANH:
   case ISD::FMA:
+  case ISD::FMULADD:
   case ISD::FMAD: {
     if (SNaN)
       return true;
@@ -7231,7 +7233,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
   }
 
   // Handle fma/fmad special cases.
-  if (Opcode == ISD::FMA || Opcode == ISD::FMAD) {
+  if (Opcode == ISD::FMA || Opcode == ISD::FMAD || Opcode == ISD::FMULADD) {
     assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
     assert(Ops[0].getValueType() == VT && Ops[1].getValueType() == VT &&
            Ops[2].getValueType() == VT && "FMA types must match!");
@@ -7242,7 +7244,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
       APFloat V1 = C1->getValueAPF();
       const APFloat &V2 = C2->getValueAPF();
       const APFloat &V3 = C3->getValueAPF();
-      if (Opcode == ISD::FMAD) {
+      if (Opcode == ISD::FMAD || Opcode == ISD::FMULADD) {
         V1.multiply(V2, APFloat::rmNearestTiesToEven);
         V1.add(V3, APFloat::rmNearestTiesToEven);
       } else
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c21890a..0f2b518 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6996,6 +6996,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                                getValue(I.getArgOperand(0)),
                                getValue(I.getArgOperand(1)),
                                getValue(I.getArgOperand(2)), Flags));
+    } else if (TLI.isOperationLegalOrCustom(ISD::FMULADD, VT)) {
+      // TODO: Support splitting the vector.
+      setValue(&I, DAG.getNode(ISD::FMULADD, sdl,
+                               getValue(I.getArgOperand(0)).getValueType(),
+                               getValue(I.getArgOperand(0)),
+                               getValue(I.getArgOperand(1)),
+                               getValue(I.getArgOperand(2)), Flags));
     } else {
       // TODO: Intrinsic calls should have fast-math-flags.
       SDValue Mul = DAG.getNode(
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index fcfbfe6..39cbfad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -310,6 +310,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FMA:                        return "fma";
   case ISD::STRICT_FMA:                 return "strict_fma";
   case ISD::FMAD:                       return "fmad";
+  case ISD::FMULADD:                    return "fmuladd";
   case ISD::FREM:                       return "frem";
   case ISD::STRICT_FREM:                return "strict_frem";
   case ISD::FCOPYSIGN:                  return "fcopysign";
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index cc503d3..920dff9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7676,6 +7676,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
     break;
   }
   case ISD::FMA:
+  case ISD::FMULADD:
   case ISD::FMAD: {
     if (!Flags.hasNoSignedZeros())
       break;
diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
index 64e5cd5..95a9c3f 100644
--- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
+++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
@@ -306,10 +306,7 @@ char &llvm::StackFrameLayoutAnalysisPassID = StackFrameLayoutAnalysisLegacy::ID;
 INITIALIZE_PASS(StackFrameLayoutAnalysisLegacy, "stack-frame-layout",
                 "Stack Frame Layout", false, false)
 
-namespace llvm {
 /// Returns a newly-created StackFrameLayout pass.
-MachineFunctionPass *createStackFrameLayoutAnalysisPass() {
+MachineFunctionPass *llvm::createStackFrameLayoutAnalysisPass() {
   return new StackFrameLayoutAnalysisLegacy();
 }
-
-} // namespace llvm
diff --git a/llvm/lib/CodeGen/StaticDataAnnotator.cpp b/llvm/lib/CodeGen/StaticDataAnnotator.cpp
index 53a9ab4..eac20120 100644
--- a/llvm/lib/CodeGen/StaticDataAnnotator.cpp
+++ b/llvm/lib/CodeGen/StaticDataAnnotator.cpp
@@ -75,22 +75,11 @@ bool StaticDataAnnotator::runOnModule(Module &M) {
 
   bool Changed = false;
   for (auto &GV : M.globals()) {
-    if (GV.isDeclarationForLinker())
+    if (!llvm::memprof::IsAnnotationOK(GV))
       continue;
 
-    // The implementation below assumes prior passes don't set section prefixes,
-    // and specifically do 'assign' rather than 'update'. So report error if a
-    // section prefix is already set.
-    if (auto maybeSectionPrefix = GV.getSectionPrefix();
-        maybeSectionPrefix && !maybeSectionPrefix->empty())
-      llvm::report_fatal_error("Global variable " + GV.getName() +
-                               " already has a section prefix " +
-                               *maybeSectionPrefix);
-
     StringRef SectionPrefix = SDPI->getConstantSectionPrefix(&GV, PSI);
-    if (SectionPrefix.empty())
-      continue;
-
+    // setSectionPrefix returns true if the section prefix is updated.
     Changed |= GV.setSectionPrefix(SectionPrefix);
   }
 
diff --git a/llvm/lib/CodeGen/StaticDataSplitter.cpp b/llvm/lib/CodeGen/StaticDataSplitter.cpp
index e22dc25..1593a40 100644
--- a/llvm/lib/CodeGen/StaticDataSplitter.cpp
+++ b/llvm/lib/CodeGen/StaticDataSplitter.cpp
@@ -130,10 +130,8 @@ StaticDataSplitter::getConstant(const MachineOperand &Op,
   if (Op.isGlobal()) {
     // Find global variables with local linkage.
     const GlobalVariable *GV = getLocalLinkageGlobalVariable(Op.getGlobal());
-    // Skip 'llvm.'-prefixed global variables conservatively because they are
-    // often handled specially, and skip those not in static data
-    // sections.
-    if (!GV || GV->getName().starts_with("llvm.") ||
+    // Skip those not eligible for annotation or not in static data sections.
+    if (!GV || !llvm::memprof::IsAnnotationOK(*GV) ||
         !inStaticDataSection(*GV, TM))
       return nullptr;
     return GV;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c23281a..060b1dd 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -815,7 +815,8 @@ void TargetLoweringBase::initActions() {
                         ISD::FTAN,           ISD::FACOS,
                         ISD::FASIN,          ISD::FATAN,
                         ISD::FCOSH,          ISD::FSINH,
-                        ISD::FTANH,          ISD::FATAN2},
+                        ISD::FTANH,          ISD::FATAN2,
+                        ISD::FMULADD},
                        VT, Expand);
 
     // Overflow operations default to expand
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index c9e4618..971f822 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -102,10 +102,8 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet,
   return true;
 }
 
-namespace llvm {
-
-Printable printReg(Register Reg, const TargetRegisterInfo *TRI,
-                   unsigned SubIdx, const MachineRegisterInfo *MRI) {
+Printable llvm::printReg(Register Reg, const TargetRegisterInfo *TRI,
+                         unsigned SubIdx, const MachineRegisterInfo *MRI) {
   return Printable([Reg, TRI, SubIdx, MRI](raw_ostream &OS) {
     if (!Reg)
       OS << "$noreg";
@@ -135,7 +133,7 @@ Printable printReg(Register Reg, const TargetRegisterInfo *TRI,
   });
 }
 
-Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+Printable llvm::printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
   return Printable([Unit, TRI](raw_ostream &OS) {
     // Generic printout when TRI is missing.
     if (!TRI) {
@@ -158,7 +156,7 @@ Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
   });
 }
 
-Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+Printable llvm::printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
   return Printable([Unit, TRI](raw_ostream &OS) {
     if (Register::isVirtualRegister(Unit)) {
       OS << '%' << Register(Unit).virtRegIndex();
@@ -168,8 +166,9 @@ Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
   });
 }
 
-Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo,
-                              const TargetRegisterInfo *TRI) {
+Printable llvm::printRegClassOrBank(Register Reg,
+                                    const MachineRegisterInfo &RegInfo,
+                                    const TargetRegisterInfo *TRI) {
   return Printable([Reg, &RegInfo, TRI](raw_ostream &OS) {
     if (RegInfo.getRegClassOrNull(Reg))
       OS << StringRef(TRI->getRegClassName(RegInfo.getRegClass(Reg))).lower();
@@ -183,8 +182,6 @@ Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo,
   });
 }
 
-} // end namespace llvm
-
 /// getAllocatableClass - Return the maximal subclass of the given register
 /// class that is alloctable, or NULL.
 const TargetRegisterClass *
diff --git a/llvm/lib/IR/ConstantFPRange.cpp b/llvm/lib/IR/ConstantFPRange.cpp
index e9c058e..5b87686 100644
--- a/llvm/lib/IR/ConstantFPRange.cpp
+++ b/llvm/lib/IR/ConstantFPRange.cpp
@@ -528,3 +528,147 @@ void ConstantFPRange::flushDenormals(DenormalMode::DenormalModeKind Mode) {
   Lower = minnum(Lower, APFloat::getZero(Sem, ZeroLowerNegative));
   Upper = maxnum(Upper, APFloat::getZero(Sem, ZeroUpperNegative));
 }
+
+/// Represent a contiguous range of values sharing the same sign.
+struct SameSignRange {
+  bool HasZero;
+  bool HasNonZero;
+  bool HasInf;
+  // The lower and upper bounds of the range (inclusive).
+  // The sign is dropped and infinities are excluded.
+  std::optional<std::pair<APFloat, APFloat>> FinitePart;
+
+  explicit SameSignRange(const APFloat &Lower, const APFloat &Upper)
+      : HasZero(Lower.isZero()), HasNonZero(!Upper.isZero()),
+        HasInf(Upper.isInfinity()) {
+    assert(!Lower.isNegative() && !Upper.isNegative() &&
+           "The sign should be dropped.");
+    assert(strictCompare(Lower, Upper) != APFloat::cmpGreaterThan &&
+           "Empty set.");
+    if (!Lower.isInfinity())
+      FinitePart = {Lower,
+                    HasInf ? APFloat::getLargest(Lower.getSemantics()) : Upper};
+  }
+};
+
+/// Split the range into positive and negative components.
+static void splitPosNeg(const APFloat &Lower, const APFloat &Upper,
+                        std::optional<SameSignRange> &NegPart,
+                        std::optional<SameSignRange> &PosPart) {
+  assert(strictCompare(Lower, Upper) != APFloat::cmpGreaterThan &&
+         "Non-NaN part is empty.");
+  if (Lower.isNegative() == Upper.isNegative()) {
+    if (Lower.isNegative())
+      NegPart = SameSignRange{abs(Upper), abs(Lower)};
+    else
+      PosPart = SameSignRange{Lower, Upper};
+    return;
+  }
+  auto &Sem = Lower.getSemantics();
+  NegPart = SameSignRange{APFloat::getZero(Sem), abs(Lower)};
+  PosPart = SameSignRange{APFloat::getZero(Sem), Upper};
+}
+
+ConstantFPRange ConstantFPRange::mul(const ConstantFPRange &Other) const {
+  auto &Sem = getSemantics();
+  bool ResMayBeQNaN = ((MayBeQNaN || MayBeSNaN) && !Other.isEmptySet()) ||
+                      ((Other.MayBeQNaN || Other.MayBeSNaN) && !isEmptySet());
+  if (isNaNOnly() || Other.isNaNOnly())
+    return getNaNOnly(Sem, /*MayBeQNaN=*/ResMayBeQNaN,
+                      /*MayBeSNaN=*/false);
+  std::optional<SameSignRange> LHSNeg, LHSPos, RHSNeg, RHSPos;
+  splitPosNeg(Lower, Upper, LHSNeg, LHSPos);
+  splitPosNeg(Other.Lower, Other.Upper, RHSNeg, RHSPos);
+  APFloat ResLower = APFloat::getInf(Sem, /*Negative=*/false);
+  APFloat ResUpper = APFloat::getInf(Sem, /*Negative=*/true);
+  auto Update = [&](std::optional<SameSignRange> &LHS,
+                    std::optional<SameSignRange> &RHS, bool Negative) {
+    if (!LHS || !RHS)
+      return;
+    // 0 * inf = QNaN
+    ResMayBeQNaN |= LHS->HasZero && RHS->HasInf;
+    ResMayBeQNaN |= RHS->HasZero && LHS->HasInf;
+    // NonZero * inf = inf
+    if ((LHS->HasInf && RHS->HasNonZero) || (RHS->HasInf && LHS->HasNonZero))
+      (Negative ? ResLower : ResUpper) = APFloat::getInf(Sem, Negative);
+    // Finite * Finite
+    if (LHS->FinitePart && RHS->FinitePart) {
+      APFloat NewLower = LHS->FinitePart->first * RHS->FinitePart->first;
+      APFloat NewUpper = LHS->FinitePart->second * RHS->FinitePart->second;
+      if (Negative) {
+        ResLower = minnum(ResLower, -NewUpper);
+        ResUpper = maxnum(ResUpper, -NewLower);
+      } else {
+        ResLower = minnum(ResLower, NewLower);
+        ResUpper = maxnum(ResUpper, NewUpper);
+      }
+    }
+  };
+  Update(LHSNeg, RHSNeg, /*Negative=*/false);
+  Update(LHSNeg, RHSPos, /*Negative=*/true);
+  Update(LHSPos, RHSNeg, /*Negative=*/true);
+  Update(LHSPos, RHSPos, /*Negative=*/false);
+  return ConstantFPRange(ResLower, ResUpper, ResMayBeQNaN, /*MayBeSNaN=*/false);
+}
+
+ConstantFPRange ConstantFPRange::div(const ConstantFPRange &Other) const {
+  auto &Sem = getSemantics();
+  bool ResMayBeQNaN = ((MayBeQNaN || MayBeSNaN) && !Other.isEmptySet()) ||
+                      ((Other.MayBeQNaN || Other.MayBeSNaN) && !isEmptySet());
+  if (isNaNOnly() || Other.isNaNOnly())
+    return getNaNOnly(Sem, /*MayBeQNaN=*/ResMayBeQNaN,
+                      /*MayBeSNaN=*/false);
+  std::optional<SameSignRange> LHSNeg, LHSPos, RHSNeg, RHSPos;
+  splitPosNeg(Lower, Upper, LHSNeg, LHSPos);
+  splitPosNeg(Other.Lower, Other.Upper, RHSNeg, RHSPos);
+  APFloat ResLower = APFloat::getInf(Sem, /*Negative=*/false);
+  APFloat ResUpper = APFloat::getInf(Sem, /*Negative=*/true);
+  auto Update = [&](std::optional<SameSignRange> &LHS,
+                    std::optional<SameSignRange> &RHS, bool Negative) {
+    if (!LHS || !RHS)
+      return;
+    // inf / inf = QNaN 0 / 0 = QNaN
+    ResMayBeQNaN |= LHS->HasInf && RHS->HasInf;
+    ResMayBeQNaN |= LHS->HasZero && RHS->HasZero;
+    // It is not straightforward to infer HasNonZeroFinite = HasFinite &&
+    // HasNonZero. By definitions we have:
+    //   HasFinite = HasNonZeroFinite || HasZero
+    //   HasNonZero = HasNonZeroFinite || HasInf
+    // Since the range is contiguous, if both HasFinite and HasNonZero are true,
+    // HasNonZeroFinite must be true.
+    bool LHSHasNonZeroFinite = LHS->FinitePart && LHS->HasNonZero;
+    bool RHSHasNonZeroFinite = RHS->FinitePart && RHS->HasNonZero;
+    // inf / Finite = inf FiniteNonZero / 0 = inf
+    if ((LHS->HasInf && RHS->FinitePart) ||
+        (LHSHasNonZeroFinite && RHS->HasZero))
+      (Negative ? ResLower : ResUpper) = APFloat::getInf(Sem, Negative);
+    // Finite / inf = 0
+    if (LHS->FinitePart && RHS->HasInf) {
+      APFloat Zero = APFloat::getZero(Sem, /*Negative=*/Negative);
+      ResLower = minnum(ResLower, Zero);
+      ResUpper = maxnum(ResUpper, Zero);
+    }
+    // Finite / FiniteNonZero
+    if (LHS->FinitePart && RHSHasNonZeroFinite) {
+      assert(!RHS->FinitePart->second.isZero() &&
+             "Divisor should be non-zero.");
+      APFloat NewLower = LHS->FinitePart->first / RHS->FinitePart->second;
+      APFloat NewUpper = LHS->FinitePart->second /
+                         (RHS->FinitePart->first.isZero()
+                              ? APFloat::getSmallest(Sem, /*Negative=*/false)
+                              : RHS->FinitePart->first);
+      if (Negative) {
+        ResLower = minnum(ResLower, -NewUpper);
+        ResUpper = maxnum(ResUpper, -NewLower);
+      } else {
+        ResLower = minnum(ResLower, NewLower);
+        ResUpper = maxnum(ResUpper, NewUpper);
+      }
+    }
+  };
+  Update(LHSNeg, RHSNeg, /*Negative=*/false);
+  Update(LHSNeg, RHSPos, /*Negative=*/true);
+  Update(LHSPos, RHSNeg, /*Negative=*/true);
+  Update(LHSPos, RHSPos, /*Negative=*/false);
+  return ConstantFPRange(ResLower, ResUpper, ResMayBeQNaN, /*MayBeSNaN=*/false);
+}
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 614c3a9..15c0198 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/IR/Statepoint.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
@@ -1002,6 +1003,18 @@ CallInst *IRBuilderBase::CreateConstrainedFPCall(
   return C;
 }
 
+Value *IRBuilderBase::CreateSelectWithUnknownProfile(Value *C, Value *True,
+                                                     Value *False,
+                                                     StringRef PassName,
+                                                     const Twine &Name) {
+  Value *Ret = CreateSelectFMF(C, True, False, {}, Name);
+  if (auto *SI = dyn_cast<SelectInst>(Ret)) {
+    setExplicitlyUnknownBranchWeightsIfProfiled(
+        *SI, *SI->getParent()->getParent(), PassName);
+  }
+  return Ret;
+}
+
 Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False,
                                    const Twine &Name, Instruction *MDFrom) {
   return CreateSelectFMF(C, True, False, {}, Name, MDFrom);
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 2ea3a24..afce803 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -1363,9 +1363,12 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
   }
   case LISTSPLAT: {
     const auto *Value = dyn_cast<TypedInit>(LHS);
-    const auto *Size = dyn_cast<IntInit>(RHS);
-    if (Value && Size) {
-      SmallVector<const Init *, 8> Args(Size->getValue(), Value);
+    const auto *Count = dyn_cast<IntInit>(RHS);
+    if (Value && Count) {
+      if (Count->getValue() < 0)
+        PrintFatalError(Twine("!listsplat count ") + Count->getAsString() +
+                        " is negative");
+      SmallVector<const Init *, 8> Args(Count->getValue(), Value);
       return ListInit::get(Args, Value->getType());
     }
     break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index dbe74b1..5700468 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2394,15 +2394,19 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
   else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
            (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
             TII->isTRANS(MI)))
-    Result = true;
+    Result = !MI.mayLoadOrStore();
 
   else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
-           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
-    Result = true;
+           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) {
+    // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS).
+    // For our purposes, these shall not be classified as VALU as this results
+    // in unexpected behavior.
+    Result = !MI.mayLoadOrStore();
+  }
 
   else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
            TII->isSALU(MI))
-    Result = true;
+    Result = !MI.mayLoadOrStore();
 
   else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
            TII->isMFMAorWMMA(MI))
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 64e34db..5f6d742 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -260,8 +260,12 @@ class NSAHelper {
 }
 
 class MIMGNSAHelper<int num_addrs,
-                    list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)>
-  : NSAHelper<> {
+                    list<RegisterOperand> addr_types_in=[]>
+    : NSAHelper<> {
+  list<RegisterOperand> addr_types =
+    !if(!empty(addr_types_in), !listsplat(VGPROp_32, num_addrs),
+        addr_types_in);
+
   list<string> AddrAsmNames = !foreach(i, !range(num_addrs), "vaddr" # i);
   let AddrIns = !dag(ins, addr_types, AddrAsmNames);
   let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";
@@ -358,7 +362,7 @@ class MIMG_gfx11<int op, dag outs, string dns = "">
 // Base class for all NSA MIMG instructions.
 // Note that 1-dword addresses always use non-NSA variants.
 class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
-                     list<RegisterClass> addr_types=[],
+                     list<RegisterOperand> addr_types=[],
                      RegisterOperand LastAddrRC = VGPROp_32>
   : MIMG<outs, dns>, MIMGe_gfx11<op> {
   let SubtargetPredicate = isGFX11Only;
@@ -378,7 +382,7 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
 }
 
 class VIMAGE_gfx12<int op, dag outs, int num_addrs, string dns="",
-                   list<RegisterClass> addr_types=[]>
+                   list<RegisterOperand> addr_types=[]>
   : VIMAGE<outs, dns>, VIMAGEe<op> {
   let SubtargetPredicate = isGFX12Plus;
   let AssemblerPredicate = isGFX12Plus;
@@ -1521,12 +1525,12 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16, bit isDual, bit isBVH8> {
   int VAddrDwords = !srl(Size, 5);
 
   int GFX11PlusNSAAddrs = !if(IsA16, 4, 5);
-  RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32);
-  list<RegisterClass> GFX11PlusAddrTypes =
-     !cond(isBVH8 : [node_ptr_type, VReg_64, VReg_96, VReg_96, VGPR_32],
-           isDual : [node_ptr_type, VReg_64, VReg_96, VReg_96, VReg_64],
-           IsA16  : [node_ptr_type, VGPR_32, VReg_96, VReg_96],
-           true   : [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]);
+  RegisterOperand node_ptr_type = !if(Is64, VGPROp_64, VGPROp_32);
+  list<RegisterOperand> GFX11PlusAddrTypes =
+     !cond(isBVH8 : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_32],
+           isDual : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_64],
+           IsA16  : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96],
+           true   : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96, VGPROp_96]);
 }
 
 class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterOperand AddrRC>
@@ -1552,7 +1556,7 @@ class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterOperand AddrRC>
 }
 
 class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
-                                  list<RegisterClass> addr_types>
+                                  list<RegisterOperand> addr_types>
     : MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "GFX11",
                      addr_types> {
   let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16));
@@ -1561,7 +1565,7 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
 
 class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs,
                                 bit isDual, bit isBVH8,
-                                list<RegisterClass> addr_types>
+                                list<RegisterOperand> addr_types>
     : VIMAGE_gfx12<op.GFX12, !if(!or(isDual, isBVH8),
                                  (outs VReg_320:$vdata, VReg_96:$ray_origin_out,
                                        VReg_96:$ray_dir_out),
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 4b54231..8851a0f 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -1659,6 +1659,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     return generateImmOutOfRangeError(
         Operands, ErrorInfo, -1, (1 << 5) - 1,
         "immediate must be non-zero in the range");
+  case Match_InvalidXSfmmVType: {
+    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+    return generateXSfmmVTypeError(ErrorLoc);
+  }
   case Match_InvalidVTypeI: {
     SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
     return generateVTypeError(ErrorLoc);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 70b7c43..e75dfe3 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -142,6 +142,22 @@ enum {
 
   ReadsPastVLShift = DestEEWShift + 2,
   ReadsPastVLMask = 1ULL << ReadsPastVLShift,
+
+  // 0 -> Don't care about altfmt bit in VTYPE.
+  // 1 -> Is not altfmt.
+  // 2 -> Is altfmt(BF16).
+  AltFmtTypeShift = ReadsPastVLShift + 1,
+  AltFmtTypeMask = 3ULL << AltFmtTypeShift,
+
+  // XSfmmbase
+  HasTWidenOpShift = AltFmtTypeShift + 2,
+  HasTWidenOpMask = 1ULL << HasTWidenOpShift,
+
+  HasTMOpShift = HasTWidenOpShift + 1,
+  HasTMOpMask = 1ULL << HasTMOpShift,
+
+  HasTKOpShift = HasTMOpShift + 1,
+  HasTKOpMask = 1ULL << HasTKOpShift,
 };
 
 // Helper functions to read TSFlags.
@@ -183,6 +199,11 @@ static inline bool hasRoundModeOp(uint64_t TSFlags) {
   return TSFlags & HasRoundModeOpMask;
 }
 
+enum class AltFmtType { DontCare, NotAltFmt, AltFmt };
+static inline AltFmtType getAltFmtType(uint64_t TSFlags) {
+  return static_cast<AltFmtType>((TSFlags & AltFmtTypeMask) >> AltFmtTypeShift);
+}
+
 /// \returns true if this instruction uses vxrm
 static inline bool usesVXRM(uint64_t TSFlags) { return TSFlags & UsesVXRMMask; }
 
@@ -204,11 +225,47 @@ static inline bool readsPastVL(uint64_t TSFlags) {
   return TSFlags & ReadsPastVLMask;
 }
 
+// XSfmmbase
+static inline bool hasTWidenOp(uint64_t TSFlags) {
+  return TSFlags & HasTWidenOpMask;
+}
+
+static inline bool hasTMOp(uint64_t TSFlags) { return TSFlags & HasTMOpMask; }
+
+static inline bool hasTKOp(uint64_t TSFlags) { return TSFlags & HasTKOpMask; }
+
+static inline unsigned getTNOpNum(const MCInstrDesc &Desc) {
+  const uint64_t TSFlags = Desc.TSFlags;
+  assert(hasTWidenOp(TSFlags) && hasVLOp(TSFlags));
+  unsigned Offset = 3;
+  if (hasTKOp(TSFlags))
+    Offset = 4;
+  return Desc.getNumOperands() - Offset;
+}
+
+static inline unsigned getTMOpNum(const MCInstrDesc &Desc) {
+  const uint64_t TSFlags = Desc.TSFlags;
+  assert(hasTWidenOp(TSFlags) && hasTMOp(TSFlags));
+  if (hasTKOp(TSFlags))
+    return Desc.getNumOperands() - 5;
+  // vtzero.t
+  return Desc.getNumOperands() - 4;
+}
+
+static inline unsigned getTKOpNum(const MCInstrDesc &Desc) {
+  [[maybe_unused]] const uint64_t TSFlags = Desc.TSFlags;
+  assert(hasTWidenOp(TSFlags) && hasTKOp(TSFlags));
+  return Desc.getNumOperands() - 3;
+}
+
 static inline unsigned getVLOpNum(const MCInstrDesc &Desc) {
   const uint64_t TSFlags = Desc.TSFlags;
   // This method is only called if we expect to have a VL operand, and all
   // instructions with VL also have SEW.
   assert(hasSEWOp(TSFlags) && hasVLOp(TSFlags));
+  // In Xsfmmbase, TN is an alias for VL, so here we use the same TSFlags bit.
+  if (hasTWidenOp(TSFlags))
+    return getTNOpNum(Desc);
   unsigned Offset = 2;
   if (hasVecPolicyOp(TSFlags))
     Offset = 3;
@@ -226,7 +283,7 @@ static inline unsigned getSEWOpNum(const MCInstrDesc &Desc) {
   const uint64_t TSFlags = Desc.TSFlags;
   assert(hasSEWOp(TSFlags));
   unsigned Offset = 1;
-  if (hasVecPolicyOp(TSFlags))
+  if (hasVecPolicyOp(TSFlags) || hasTWidenOp(TSFlags))
     Offset = 2;
   return Desc.getNumOperands() - Offset;
 }
@@ -243,6 +300,9 @@ static inline int getFRMOpNum(const MCInstrDesc &Desc) {
   if (!hasRoundModeOp(TSFlags) || usesVXRM(TSFlags))
     return -1;
 
+  if (hasTWidenOp(TSFlags) && hasTMOp(TSFlags))
+    return getTMOpNum(Desc) - 1;
+
   // The operand order
   // --------------------------------------
   // | n-1 (if any)   | n-2  | n-3 | n-4 |
@@ -385,7 +445,9 @@ enum OperandType : unsigned {
   OPERAND_SEW_MASK,
   // Vector rounding mode for VXRM or FRM.
   OPERAND_VEC_RM,
-  OPERAND_LAST_RISCV_IMM = OPERAND_VEC_RM,
+  // Vtype operand for XSfmm extension.
+  OPERAND_XSFMM_VTYPE,
+  OPERAND_LAST_RISCV_IMM = OPERAND_XSFMM_VTYPE,
   // Operand is either a register or uimm5, this is used by V extension pseudo
   // instructions to represent a value that be passed as AVL to either vsetvli
   // or vsetivli.
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index cf8d120..9ed3b97 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -168,10 +168,13 @@ struct DemandedFields {
   // If this is true, we demand that VTYPE is set to some legal state, i.e. that
   // vill is unset.
   bool VILL = false;
+  bool UseTWiden = false;
+  bool UseAltFmt = false;
 
   // Return true if any part of VTYPE was used
   bool usedVTYPE() const {
-    return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL;
+    return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL ||
+           UseTWiden || UseAltFmt;
   }
 
   // Return true if any property of VL was used
@@ -187,6 +190,8 @@ struct DemandedFields {
     TailPolicy = true;
     MaskPolicy = true;
     VILL = true;
+    UseTWiden = true;
+    UseAltFmt = true;
   }
 
   // Mark all VL properties as demanded
@@ -212,6 +217,8 @@ struct DemandedFields {
     TailPolicy |= B.TailPolicy;
     MaskPolicy |= B.MaskPolicy;
     VILL |= B.VILL;
+    UseAltFmt |= B.UseAltFmt;
+    UseTWiden |= B.UseTWiden;
   }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -258,7 +265,9 @@ struct DemandedFields {
     OS << "SEWLMULRatio=" << SEWLMULRatio << ", ";
     OS << "TailPolicy=" << TailPolicy << ", ";
     OS << "MaskPolicy=" << MaskPolicy << ", ";
-    OS << "VILL=" << VILL;
+    OS << "VILL=" << VILL << ", ";
+    OS << "UseAltFmt=" << UseAltFmt << ", ";
+    OS << "UseTWiden=" << UseTWiden;
     OS << "}";
   }
 #endif
@@ -328,6 +337,15 @@ static bool areCompatibleVTYPEs(uint64_t CurVType, uint64_t NewVType,
   if (Used.MaskPolicy && RISCVVType::isMaskAgnostic(CurVType) !=
                              RISCVVType::isMaskAgnostic(NewVType))
     return false;
+  if (Used.UseTWiden && (RISCVVType::hasXSfmmWiden(CurVType) !=
+                             RISCVVType::hasXSfmmWiden(NewVType) ||
+                         (RISCVVType::hasXSfmmWiden(CurVType) &&
+                          RISCVVType::getXSfmmWiden(CurVType) !=
+                              RISCVVType::getXSfmmWiden(NewVType))))
+    return false;
+  if (Used.UseAltFmt &&
+      RISCVVType::isAltFmt(CurVType) != RISCVVType::isAltFmt(NewVType))
+    return false;
   return true;
 }
 
@@ -479,6 +497,11 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
     Res.TailPolicy = false;
   }
 
+  Res.UseAltFmt = RISCVII::getAltFmtType(MI.getDesc().TSFlags) !=
+                  RISCVII::AltFmtType::DontCare;
+  Res.UseTWiden = RISCVII::hasTWidenOp(MI.getDesc().TSFlags) ||
+                  RISCVInstrInfo::isXSfmmVectorConfigInstr(MI);
+
   return Res;
 }
 
@@ -510,6 +533,8 @@ class VSETVLIInfo {
   uint8_t TailAgnostic : 1;
   uint8_t MaskAgnostic : 1;
   uint8_t SEWLMULRatioOnly : 1;
+  uint8_t AltFmt : 1;
+  uint8_t TWiden : 3;
 
 public:
   VSETVLIInfo()
@@ -586,6 +611,8 @@ public:
   RISCVVType::VLMUL getVLMUL() const { return VLMul; }
   bool getTailAgnostic() const { return TailAgnostic; }
   bool getMaskAgnostic() const { return MaskAgnostic; }
+  bool getAltFmt() const { return AltFmt; }
+  unsigned getTWiden() const { return TWiden; }
 
   bool hasNonZeroAVL(const LiveIntervals *LIS) const {
     if (hasAVLImm())
@@ -647,21 +674,31 @@ public:
     SEW = RISCVVType::getSEW(VType);
     TailAgnostic = RISCVVType::isTailAgnostic(VType);
     MaskAgnostic = RISCVVType::isMaskAgnostic(VType);
+    AltFmt = RISCVVType::isAltFmt(VType);
+    TWiden =
+        RISCVVType::hasXSfmmWiden(VType) ? RISCVVType::getXSfmmWiden(VType) : 0;
   }
-  void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA) {
+  void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA, bool Altfmt,
+                unsigned W) {
     assert(isValid() && !isUnknown() &&
            "Can't set VTYPE for uninitialized or unknown");
     VLMul = L;
     SEW = S;
     TailAgnostic = TA;
     MaskAgnostic = MA;
+    AltFmt = Altfmt;
+    TWiden = W;
   }
 
+  void setAltFmt(bool AF) { AltFmt = AF; }
+
   void setVLMul(RISCVVType::VLMUL VLMul) { this->VLMul = VLMul; }
 
   unsigned encodeVTYPE() const {
     assert(isValid() && !isUnknown() && !SEWLMULRatioOnly &&
            "Can't encode VTYPE for uninitialized or unknown");
+    if (TWiden != 0)
+      return RISCVVType::encodeXSfmmVType(SEW, TWiden, AltFmt);
     return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic);
   }
 
@@ -674,9 +711,9 @@ public:
            "Can't compare VTYPE in unknown state");
     assert(!SEWLMULRatioOnly && !Other.SEWLMULRatioOnly &&
            "Can't compare when only LMUL/SEW ratio is valid.");
-    return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic) ==
+    return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, TWiden) ==
            std::tie(Other.VLMul, Other.SEW, Other.TailAgnostic,
-                    Other.MaskAgnostic);
+                    Other.MaskAgnostic, Other.AltFmt, Other.TWiden);
   }
 
   unsigned getSEWLMULRatio() const {
@@ -825,7 +862,9 @@ public:
        << "SEW=e" << (unsigned)SEW << ", "
        << "TailAgnostic=" << (bool)TailAgnostic << ", "
        << "MaskAgnostic=" << (bool)MaskAgnostic << ", "
-       << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << "}";
+       << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << ", "
+       << "TWiden=" << (unsigned)TWiden << ", "
+       << "AltFmt=" << (bool)AltFmt << "}";
   }
 #endif
 };
@@ -853,6 +892,11 @@ struct BlockData {
   BlockData() = default;
 };
 
+enum TKTMMode {
+  VSETTK = 0,
+  VSETTM = 1,
+};
+
 class RISCVInsertVSETVLI : public MachineFunctionPass {
   const RISCVSubtarget *ST;
   const TargetInstrInfo *TII;
@@ -908,6 +952,7 @@ private:
   VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) const;
   VSETVLIInfo computeInfoForInstr(const MachineInstr &MI) const;
   void forwardVSETVLIAVL(VSETVLIInfo &Info) const;
+  bool insertVSETMTK(MachineBasicBlock &MBB, TKTMMode Mode) const;
 };
 
 } // end anonymous namespace
@@ -945,6 +990,18 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const {
   VSETVLIInfo NewInfo;
   if (MI.getOpcode() == RISCV::PseudoVSETIVLI) {
     NewInfo.setAVLImm(MI.getOperand(1).getImm());
+  } else if (RISCVInstrInfo::isXSfmmVectorConfigTNInstr(MI)) {
+    assert(MI.getOpcode() == RISCV::PseudoSF_VSETTNT ||
+           MI.getOpcode() == RISCV::PseudoSF_VSETTNTX0);
+    switch (MI.getOpcode()) {
+    case RISCV::PseudoSF_VSETTNTX0:
+      NewInfo.setAVLVLMAX();
+      break;
+    case RISCV::PseudoSF_VSETTNT:
+      Register ATNReg = MI.getOperand(1).getReg();
+      NewInfo.setAVLRegDef(getVNInfoFromReg(ATNReg, MI, LIS), ATNReg);
+      break;
+    }
   } else {
     assert(MI.getOpcode() == RISCV::PseudoVSETVLI ||
            MI.getOpcode() == RISCV::PseudoVSETVLIX0);
@@ -1005,11 +1062,34 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
 
   RISCVVType::VLMUL VLMul = RISCVII::getLMul(TSFlags);
 
+  bool AltFmt = RISCVII::getAltFmtType(TSFlags) == RISCVII::AltFmtType::AltFmt;
+  InstrInfo.setAltFmt(AltFmt);
+
   unsigned Log2SEW = MI.getOperand(getSEWOpNum(MI)).getImm();
   // A Log2SEW of 0 is an operation on mask registers only.
   unsigned SEW = Log2SEW ? 1 << Log2SEW : 8;
   assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
 
+  if (RISCVII::hasTWidenOp(TSFlags)) {
+    const MachineOperand &TWidenOp =
+        MI.getOperand(MI.getNumExplicitOperands() - 1);
+    unsigned TWiden = TWidenOp.getImm();
+
+    InstrInfo.setAVLVLMAX();
+    if (RISCVII::hasVLOp(TSFlags)) {
+      const MachineOperand &TNOp =
+          MI.getOperand(RISCVII::getTNOpNum(MI.getDesc()));
+
+      if (TNOp.getReg().isVirtual())
+        InstrInfo.setAVLRegDef(getVNInfoFromReg(TNOp.getReg(), MI, LIS),
+                               TNOp.getReg());
+    }
+
+    InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, TWiden);
+
+    return InstrInfo;
+  }
+
   if (RISCVII::hasVLOp(TSFlags)) {
     const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
     if (VLOp.isImm()) {
@@ -1045,7 +1125,9 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
     assert(SEW == EEW && "Initial SEW doesn't match expected EEW");
   }
 #endif
-  InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic);
+  // TODO: Propagate the twiden from previous vtype for potential reuse.
+  InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt,
+                     /*TWiden*/ 0);
 
   forwardVSETVLIAVL(InstrInfo);
 
@@ -1053,10 +1135,33 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
 }
 
 void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
-                     MachineBasicBlock::iterator InsertPt, DebugLoc DL,
-                     const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo) {
-
+                                       MachineBasicBlock::iterator InsertPt,
+                                       DebugLoc DL, const VSETVLIInfo &Info,
+                                       const VSETVLIInfo &PrevInfo) {
   ++NumInsertedVSETVL;
+
+  if (Info.getTWiden()) {
+    if (Info.hasAVLVLMAX()) {
+      Register DestReg = MRI->createVirtualRegister(&RISCV::GPRNoX0RegClass);
+      auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoSF_VSETTNTX0))
+                    .addReg(DestReg, RegState::Define | RegState::Dead)
+                    .addReg(RISCV::X0, RegState::Kill)
+                    .addImm(Info.encodeVTYPE());
+      if (LIS) {
+        LIS->InsertMachineInstrInMaps(*MI);
+        LIS->createAndComputeVirtRegInterval(DestReg);
+      }
+    } else {
+      auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoSF_VSETTNT))
+                    .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+                    .addReg(Info.getAVLReg())
+                    .addImm(Info.encodeVTYPE());
+      if (LIS)
+        LIS->InsertMachineInstrInMaps(*MI);
+    }
+    return;
+  }
+
   if (PrevInfo.isValid() && !PrevInfo.isUnknown()) {
     // Use X0, X0 form if the AVL is the same and the SEW+LMUL gives the same
     // VLMAX.
@@ -1198,7 +1303,8 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
     // be coalesced into another vsetvli since we won't demand any fields.
     VSETVLIInfo NewInfo; // Need a new VSETVLIInfo to clear SEWLMULRatioOnly
     NewInfo.setAVLImm(1);
-    NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true);
+    NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true,
+                     /*AltFmt*/ false, /*W*/ 0);
     Info = NewInfo;
     return;
   }
@@ -1240,7 +1346,9 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
       (Demanded.TailPolicy ? IncomingInfo : Info).getTailAgnostic() ||
           IncomingInfo.getTailAgnostic(),
       (Demanded.MaskPolicy ? IncomingInfo : Info).getMaskAgnostic() ||
-          IncomingInfo.getMaskAgnostic());
+          IncomingInfo.getMaskAgnostic(),
+      (Demanded.UseAltFmt ? IncomingInfo : Info).getAltFmt(),
+      Demanded.UseTWiden ? IncomingInfo.getTWiden() : 0);
 
   // If we only knew the sew/lmul ratio previously, replace the VTYPE but keep
   // the AVL.
@@ -1293,7 +1401,8 @@ bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB,
 
     if (RISCVInstrInfo::isVectorConfigInstr(MI) ||
         RISCVII::hasSEWOp(MI.getDesc().TSFlags) ||
-        isVectorCopy(ST->getRegisterInfo(), MI))
+        isVectorCopy(ST->getRegisterInfo(), MI) ||
+        RISCVInstrInfo::isXSfmmVectorConfigInstr(MI))
       HadVectorOp = true;
 
     transferAfter(Info, MI);
@@ -1675,6 +1784,12 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
   };
 
   for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
+    // TODO: Support XSfmm.
+    if (RISCVII::hasTWidenOp(MI.getDesc().TSFlags) ||
+        RISCVInstrInfo::isXSfmmVectorConfigInstr(MI)) {
+      NextMI = nullptr;
+      continue;
+    }
 
     if (!RISCVInstrInfo::isVectorConfigInstr(MI)) {
       Used.doUnion(getDemanded(MI, ST));
@@ -1788,6 +1903,65 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
   }
 }
 
+bool RISCVInsertVSETVLI::insertVSETMTK(MachineBasicBlock &MBB,
+                                       TKTMMode Mode) const {
+
+  bool Changed = false;
+  for (auto &MI : MBB) {
+    uint64_t TSFlags = MI.getDesc().TSFlags;
+    if (RISCVInstrInfo::isXSfmmVectorConfigTMTKInstr(MI) ||
+        !RISCVII::hasSEWOp(TSFlags) || !RISCVII::hasTWidenOp(TSFlags))
+      continue;
+
+    VSETVLIInfo CurrInfo = computeInfoForInstr(MI);
+
+    if (Mode == VSETTK && !RISCVII::hasTKOp(TSFlags))
+      continue;
+
+    if (Mode == VSETTM && !RISCVII::hasTMOp(TSFlags))
+      continue;
+
+    unsigned OpNum = 0;
+    unsigned Opcode = 0;
+    switch (Mode) {
+    case VSETTK:
+      OpNum = RISCVII::getTKOpNum(MI.getDesc());
+      Opcode = RISCV::PseudoSF_VSETTK;
+      break;
+    case VSETTM:
+      OpNum = RISCVII::getTMOpNum(MI.getDesc());
+      Opcode = RISCV::PseudoSF_VSETTM;
+      break;
+    }
+
+    assert(OpNum && Opcode && "Invalid OpNum or Opcode");
+
+    MachineOperand &Op = MI.getOperand(OpNum);
+
+    auto TmpMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opcode))
+                     .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+                     .addReg(Op.getReg())
+                     .addImm(Log2_32(CurrInfo.getSEW()))
+                     .addImm(Log2_32(CurrInfo.getTWiden()) + 1);
+
+    Changed = true;
+    Register Reg = Op.getReg();
+    Op.setReg(Register());
+    Op.setIsKill(false);
+    if (LIS) {
+      LIS->InsertMachineInstrInMaps(*TmpMI);
+      LiveInterval &LI = LIS->getInterval(Reg);
+
+      // Erase the AVL operand from the instruction.
+      LIS->shrinkToUses(&LI);
+      // TODO: Enable this once needVSETVLIPHI is supported.
+      // SmallVector<LiveInterval *> SplitLIs;
+      // LIS->splitSeparateComponents(LI, SplitLIs);
+    }
+  }
+  return Changed;
+}
+
 bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
   // Skip if the vector extension is not enabled.
   ST = &MF.getSubtarget<RISCVSubtarget>();
@@ -1865,6 +2039,11 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
   for (MachineBasicBlock &MBB : MF)
     insertReadVL(MBB);
 
+  for (MachineBasicBlock &MBB : MF) {
+    insertVSETMTK(MBB, VSETTM);
+    insertVSETMTK(MBB, VSETTK);
+  }
+
   BlockInfo.clear();
   return HaveVectorOp;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index 2afd77a..5b06303 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -267,6 +267,22 @@ class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr,
   // operands' VLs.
   bit ReadsPastVL = 0;
   let TSFlags{26} = ReadsPastVL;
+
+  // 0 -> Don't care about altfmt bit in VTYPE.
+  // 1 -> Is not altfmt.
+  // 2 -> Is altfmt(BF16).
+  bits<2> AltFmtType = 0;
+  let TSFlags{28-27} = AltFmtType;
+
+  // XSfmmbase
+  bit HasTWidenOp = 0;
+  let TSFlags{29} = HasTWidenOp;
+
+  bit HasTmOp = 0;
+  let TSFlags{30} = HasTmOp;
+
+  bit HasTkOp = 0;
+  let TSFlags{31} = HasTkOp;
 }
 
 class RVInst<dag outs, dag ins, string opcodestr, string argstr,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 96e1078..ddb53a2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3005,6 +3005,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
           else
             Ok = RISCVFPRndMode::isValidRoundingMode(Imm);
           break;
+        case RISCVOp::OPERAND_XSFMM_VTYPE:
+          Ok = RISCVVType::isValidXSfmmVType(Imm);
+          break;
         }
         if (!Ok) {
           ErrInfo = "Invalid immediate";
@@ -3670,6 +3673,11 @@ std::string RISCVInstrInfo::createMIROperandComment(
     RISCVVType::printVType(Imm, OS);
     break;
   }
+  case RISCVOp::OPERAND_XSFMM_VTYPE: {
+    unsigned Imm = Op.getImm();
+    RISCVVType::printXSfmmVType(Imm, OS);
+    break;
+  }
   case RISCVOp::OPERAND_SEW:
   case RISCVOp::OPERAND_SEW_MASK: {
     unsigned Log2SEW = Op.getImm();
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 298d35a..c1b23af 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -128,6 +128,9 @@ defvar TAIL_AGNOSTIC = 1;
 defvar TU_MU = 0;
 defvar TA_MU = 1;
 defvar TA_MA = 3;
+defvar DONT_CARE_ALTFMT = 0;
+defvar IS_NOT_ALTFMT = 1;
+defvar IS_ALTFMT = 2;
 
 //===----------------------------------------------------------------------===//
 // Utilities.
@@ -159,7 +162,8 @@ class PseudoToVInst<string PseudoInst> {
                         ["_M4", ""],
                         ["_M8", ""],
                         ["_SE", ""],
-                        ["_RM", ""]
+                        ["_RM", ""],
+                        ["_ALT", ""]
                        ];
   string VInst = !foldl(PseudoInst, AffixSubsts, Acc, AffixSubst,
                         !subst(AffixSubst[0], AffixSubst[1], Acc));
@@ -6396,7 +6400,7 @@ let Defs = [VXSAT] in {
 // 13. Vector Floating-Point Instructions
 //===----------------------------------------------------------------------===//
 
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
 //===----------------------------------------------------------------------===//
 // 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions
 //===----------------------------------------------------------------------===//
@@ -6565,7 +6569,7 @@ defm PseudoVFNCVT_F_F      : VPseudoVNCVTD_W_RM;
 
 defm PseudoVFNCVT_ROD_F_F  : VPseudoVNCVTD_W;
 } // mayRaiseFPException = true
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
 
 //===----------------------------------------------------------------------===//
 // 14. Vector Reduction Operations
@@ -6593,7 +6597,7 @@ defm PseudoVWREDSUM    : VPseudoVWRED_VS;
 }
 } // Predicates = [HasVInstructions]
 
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
 //===----------------------------------------------------------------------===//
 // 14.3. Vector Single-Width Floating-Point Reduction Instructions
 //===----------------------------------------------------------------------===//
@@ -6612,7 +6616,7 @@ defm PseudoVFWREDUSUM  : VPseudoVFWRED_VS_RM;
 defm PseudoVFWREDOSUM  : VPseudoVFWREDO_VS_RM;
 }
 
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
 
 //===----------------------------------------------------------------------===//
 // 15. Vector Mask Instructions
@@ -6703,7 +6707,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
 // 16.2. Floating-Point Scalar Move Instructions
 //===----------------------------------------------------------------------===//
 
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
 let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
   foreach f = FPList in {
     let HasSEWOp = 1, BaseInstr = VFMV_F_S in
@@ -6718,7 +6722,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
       Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>;
   }
 }
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
 
 //===----------------------------------------------------------------------===//
 // 16.3. Vector Slide Instructions
@@ -6730,10 +6734,10 @@ let Predicates = [HasVInstructions] in {
   defm PseudoVSLIDE1DOWN : VPseudoVSLD1_VX;
 } // Predicates = [HasVInstructions]
 
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
   defm PseudoVFSLIDE1UP  : VPseudoVSLD1_VF<"@earlyclobber $rd">;
   defm PseudoVFSLIDE1DOWN : VPseudoVSLD1_VF;
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
 
 //===----------------------------------------------------------------------===//
 // 16.4. Vector Register Gather Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index 557d873..6a4119a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -438,8 +438,10 @@ let Predicates = [HasVendorXSfvcp] in {
   }
   foreach f = FPList in {
     foreach m = f.MxList in {
-    defm f.FX # "V" : VPseudoVC_XV<m, f.fprclass, payload1>;
-    defm f.FX # "VV" : VPseudoVC_XVV<m, f.fprclass, payload1>;
+      let AltFmtType = IS_NOT_ALTFMT in {
+        defm f.FX # "V" : VPseudoVC_XV<m, f.fprclass, payload1>;
+        defm f.FX # "VV" : VPseudoVC_XVV<m, f.fprclass, payload1>;
+      }
     }
   }
   foreach m = MxListW in {
@@ -449,7 +451,8 @@ let Predicates = [HasVendorXSfvcp] in {
   }
   foreach f = FPListW in {
     foreach m = f.MxList in
-    defm f.FX # "VW" : VPseudoVC_XVW<m, f.fprclass, payload1>;
+      let AltFmtType = IS_NOT_ALTFMT in
+        defm f.FX # "VW" : VPseudoVC_XVW<m, f.fprclass, payload1>;
   }
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
index a5ee701..5ad22e6b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
@@ -225,7 +225,7 @@ let Predicates = [HasVendorXSfmmbase] in {
   def SF_VSETTM : SFInstSetSingle<(outs GPR:$rd), (ins GPR:$rs1), 0b00001,
                                   "sf.vsettm", "$rd, $rs1">;
   def SF_VSETTK : SFInstSetSingle<(outs GPR:$rd), (ins GPR:$rs1), 0b00010,
-                                   "sf.vsettk", "$rd, $rs1">;
+                                  "sf.vsettk", "$rd, $rs1">;
   def SF_VTDISCARD : SFInstVtDiscard<"sf.vtdiscard">;
 
   def SF_VTMV_V_T : SFInstTileMoveOp<0b010000, (outs VR:$vd), (ins GPR:$rs1),
@@ -277,3 +277,144 @@ let Uses = [FRM], mayRaiseFPException = true in {
 } // Predicates = [HasVendorXSfmm32a8f]
 
 } // DecoderNamespace = "XSfvector"
+
+class VPseudoSF_VTileLoad
+    : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew,
+                                ixlenimm:$twiden)> {
+  let mayLoad = 1;
+  let mayStore = 0;
+  let HasVLOp = 1; // Tn
+  let HasSEWOp = 1;
+  let HasTWidenOp = 1;
+  let hasSideEffects = 1;
+}
+
+class VPseudoSF_VTileStore
+    : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew,
+                                ixlenimm:$twiden)> {
+  let mayLoad = 0;
+  let mayStore = 1;
+  let HasVLOp = 1; // Tn
+  let HasSEWOp = 1;
+  let HasTWidenOp = 1;
+  let hasSideEffects = 1;
+}
+
+class VPseudoSF_VTileMove_V_T
+    : RISCVVPseudo<(outs VRM8:$vd), (ins GPR:$rs1, AVL:$atn, ixlenimm:$sew,
+                                         ixlenimm:$twiden)> {
+  let mayLoad = 0;
+  let mayStore = 0;
+  let HasVLOp = 1; // Tn
+  let HasSEWOp = 1;
+  let HasTWidenOp = 1;
+  let hasSideEffects = 1;
+}
+
+class VPseudoSF_VTileMove_T_V
+    : RISCVVPseudo<(outs), (ins GPR:$rs1, VRM8:$vs2, AVL:$atn, ixlenimm:$sew,
+                                ixlenimm:$twiden)> {
+  let mayLoad = 0;
+  let mayStore = 0;
+  let HasVLOp = 1; // Tn
+  let HasSEWOp = 1;
+  let HasTWidenOp = 1;
+  let hasSideEffects = 1;
+}
+
+class VPseudoSF_MatMul<RegisterClass mtd_class>
+    : RISCVVPseudo<(outs),
+                   (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, AVL:$atm, AVL:$atn,
+                        AVL:$atk, ixlenimm:$sew, ixlenimm:$twiden)> {
+  let mayLoad = 0;
+  let mayStore = 0;
+  let HasTmOp = 1;
+  let HasVLOp = 1; // Tn
+  let HasTkOp = 1;
+  let HasSEWOp = 1;
+  let HasTWidenOp = 1;
+  let hasSideEffects = 1;
+}
+
+class VPseudoSF_MatMul_FRM<RegisterClass mtd_class>
+    : RISCVVPseudo<(outs),
+                   (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, ixlenimm:$frm,
+                        AVL:$atm, AVL:$atn, AVL:$atk, ixlenimm:$sew,
+                        ixlenimm:$twiden), []> {
+  let mayLoad = 0;
+  let mayStore = 0;
+  let HasTmOp = 1;
+  let HasVLOp = 1; // Tn
+  let HasTkOp = 1;
+  let HasSEWOp = 1;
+  let HasRoundModeOp = 1;
+  let hasPostISelHook = 1;
+  let HasTWidenOp = 1;
+  let hasSideEffects = 1;
+  let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+let Defs = [VL, VTYPE] in {
+  def PseudoSF_VSETTNT
+      : Pseudo<(outs GPR:$rd),
+               (ins GPRNoX0:$rs1, XSfmmVTypeOp:$vtypei), []>,
+        PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>,
+        Sched<[WriteVSETVLI, ReadVSETVLI]>;
+  def PseudoSF_VSETTNTX0
+      : Pseudo<(outs GPRNoX0:$rd),
+               (ins GPRX0:$rs1, XSfmmVTypeOp:$vtypei), []>,
+        PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>,
+        Sched<[WriteVSETVLI, ReadVSETVLI]>;
+  def PseudoSF_VSETTNTX0X0
+      : Pseudo<(outs GPRX0:$rd),
+               (ins GPRX0:$rs1, XSfmmVTypeOp:$vtypei), []>,
+        PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>,
+        Sched<[WriteVSETVLI, ReadVSETVLI]>;
+}
+
+let Defs = [VTYPE], Uses = [VTYPE], HasTWidenOp = 1, HasSEWOp = 1 in {
+  def PseudoSF_VSETTM
+      : Pseudo<(outs GPR:$rd),
+               (ins GPR:$rs1, ixlenimm:$log2sew, ixlenimm:$twiden), []>,
+        PseudoInstExpansion<(SF_VSETTM GPR:$rd, GPR:$rs1)>,
+        Sched<[WriteVSETVLI, ReadVSETVLI]>;
+  def PseudoSF_VSETTK
+      : Pseudo<(outs GPR:$rd),
+               (ins GPR:$rs1, ixlenimm:$logwsew, ixlenimm:$twiden), []>,
+        PseudoInstExpansion<(SF_VSETTK GPR:$rd, GPR:$rs1)>,
+        Sched<[WriteVSETVLI, ReadVSETVLI]>;
+}
+}
+
+foreach eew = [8, 16, 32, 64] in {
+  def PseudoSF_VLTE # eew : VPseudoSF_VTileLoad;
+  def PseudoSF_VSTE # eew : VPseudoSF_VTileStore;
+}
+
+def PseudoSF_VTMV_T_V : VPseudoSF_VTileMove_T_V;
+def PseudoSF_VTMV_V_T : VPseudoSF_VTileMove_V_T;
+
+foreach a = I8Encodes in
+  foreach b = I8Encodes in
+    def PseudoSF_MM_ # !toupper(a.Name) # _ # !toupper(b.Name)
+        : VPseudoSF_MatMul<TRM4>;
+
+let AltFmtType = IS_NOT_ALTFMT in
+  def PseudoSF_MM_F_F : VPseudoSF_MatMul_FRM<TRM2>;
+let AltFmtType = IS_ALTFMT in
+  def PseudoSF_MM_F_F_ALT : VPseudoSF_MatMul_FRM<TRM2>;
+
+foreach e1 = [5, 4] in
+  foreach e2 = [5, 4] in
+    def PseudoSF_MM_E # e1 # M # !sub(7, e1) # _E # e2 # M # !sub(7, e2)
+        : VPseudoSF_MatMul_FRM<TRM4>;
+
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
+  let HasVLOp = 1, HasTmOp = 1, HasTWidenOp = 1, HasSEWOp = 1 in
+    def PseudoSF_VTZERO_T
+        : RISCVVPseudo<(outs),
+                       (ins TR:$rd, AVL:$atm, AVL:$atn, ixlenimm:$sew,
+                            ixlenimm:$twiden)>;
+  def PseudoSF_VTDISCARD : RISCVVPseudo<(outs), (ins), []>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
index 3658817..dcae977 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
@@ -78,7 +78,41 @@ def isVectorConfigInstr
                        PseudoVSETVLI,
                        PseudoVSETVLIX0,
                        PseudoVSETVLIX0X0,
-                       PseudoVSETIVLI
+                       PseudoVSETIVLI,
+                       PseudoSF_VSETTNT,
+                       PseudoSF_VSETTNTX0,
+                       PseudoSF_VSETTNTX0X0
+                     ]>>>;
+
+// Returns true if this is a PseudoSF_VSETTNT* instructions.
+def isXSfmmVectorConfigTNInstr
+    : TIIPredicate<"isXSfmmVectorConfigTNInstr",
+                   MCReturnStatement<
+                     CheckOpcode<[
+                       PseudoSF_VSETTNT,
+                       PseudoSF_VSETTNTX0,
+                       PseudoSF_VSETTNTX0X0
+                     ]>>>;
+
+// Returns true if this is PseudoSF_VSETTM or PseudoSF_VSETTK.
+def isXSfmmVectorConfigTMTKInstr
+    : TIIPredicate<"isXSfmmVectorConfigTMTKInstr",
+                   MCReturnStatement<
+                     CheckOpcode<[
+                       PseudoSF_VSETTM,
+                       PseudoSF_VSETTK
+                     ]>>>;
+
+// Returns true if this is a XSfmm vector configuration instruction.
+def isXSfmmVectorConfigInstr
+    : TIIPredicate<"isXSfmmVectorConfigInstr",
+                   MCReturnStatement<
+                     CheckOpcode<[
+                       PseudoSF_VSETTNT,
+                       PseudoSF_VSETTNTX0,
+                       PseudoSF_VSETTNTX0X0,
+                       PseudoSF_VSETTM,
+                       PseudoSF_VSETTK
                      ]>>>;
 
 // Return true if this is 'vsetvli x0, x0, vtype' which preserves
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 40b6416..e9f43b9 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -178,6 +178,10 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   // Shadow stack pointer.
   markSuperRegs(Reserved, RISCV::SSP);
 
+  // XSfmmbase
+  for (MCPhysReg Reg = RISCV::T0; Reg <= RISCV::T15; Reg++)
+    markSuperRegs(Reserved, Reg);
+
   assert(checkAllSuperRegsMarked(Reserved));
   return Reserved;
 }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 6472334..47c24fc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -317,6 +317,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Custom);
     }
 
+    if (Subtarget->hasFP16()) {
+      setOperationAction(ISD::FMA, MVT::v8f16, Legal);
+    }
+
+    if (Subtarget->hasRelaxedSIMD()) {
+      setOperationAction(ISD::FMULADD, MVT::v4f32, Legal);
+      setOperationAction(ISD::FMULADD, MVT::v2f64, Legal);
+    }
+
     // Partial MLA reductions.
     for (auto Op : {ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA}) {
       setPartialReduceMLAAction(Op, MVT::v4i32, MVT::v16i8, Legal);
@@ -1120,6 +1129,18 @@ WebAssemblyTargetLowering::getPreferredVectorAction(MVT VT) const {
   return TargetLoweringBase::getPreferredVectorAction(VT);
 }
 
+bool WebAssemblyTargetLowering::isFMAFasterThanFMulAndFAdd(
+    const MachineFunction &MF, EVT VT) const {
+  if (!Subtarget->hasFP16() || !VT.isVector())
+    return false;
+
+  EVT ScalarVT = VT.getScalarType();
+  if (!ScalarVT.isSimple())
+    return false;
+
+  return ScalarVT.getSimpleVT().SimpleTy == MVT::f16;
+}
+
 bool WebAssemblyTargetLowering::shouldSimplifyDemandedVectorElts(
     SDValue Op, const TargetLoweringOpt &TLO) const {
   // ISel process runs DAGCombiner after legalization; this step is called
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index b33a853..472ec67 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -81,6 +81,8 @@ private:
 
   TargetLoweringBase::LegalizeTypeAction
   getPreferredVectorAction(MVT VT) const override;
+  bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+                                  EVT VT) const override;
 
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 49af78b..0f6e1ca 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1213,6 +1213,27 @@ defm EXTMUL_LOW_U :
 defm EXTMUL_HIGH_U :
   SIMDExtBinary<I64x2, extmul_high_u, "extmul_high_i32x4_u", 0xdf>;
 
+// Pattern for i32x4.dot_i16x8_s
+def : Pat<
+  (v4i32 (add
+    (wasm_shuffle
+      (v4i32 (extmul_low_s v8i16:$lhs, v8i16:$rhs)),
+      (v4i32 (extmul_high_s v8i16:$lhs, v8i16:$rhs)),
+      (i32 0), (i32 1), (i32 2), (i32 3),
+      (i32 8), (i32 9), (i32 10), (i32 11),
+      (i32 16), (i32 17), (i32 18), (i32 19),
+      (i32 24), (i32 25), (i32 26), (i32 27)),
+    (wasm_shuffle
+      (v4i32 (extmul_low_s v8i16:$lhs, v8i16:$rhs)),
+      (v4i32 (extmul_high_s v8i16:$lhs, v8i16:$rhs)),
+      (i32 4), (i32 5), (i32 6), (i32 7),
+      (i32 12), (i32 13), (i32 14), (i32 15),
+      (i32 20), (i32 21), (i32 22), (i32 23),
+      (i32 28), (i32 29), (i32 30), (i32 31)))
+  ),
+  (v4i32 (DOT v8i16:$lhs, v8i16:$rhs))
+>;
+
 //===----------------------------------------------------------------------===//
 // Floating-point unary arithmetic
 //===----------------------------------------------------------------------===//
@@ -1626,7 +1647,8 @@ defm "" : RelaxedConvert<I32x4, F64x2, int_wasm_relaxed_trunc_unsigned_zero,
 // Relaxed (Negative) Multiply-Add  (madd/nmadd)
 //===----------------------------------------------------------------------===//
 
-multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> reqs> {
+multiclass RELAXED_SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS,
+                            list<Predicate> reqs> {
   defm MADD_#vec :
     SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
            [(set (vec.vt V128:$dst), (int_wasm_relaxed_madd
@@ -1640,16 +1662,46 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate>
            vec.prefix#".relaxed_nmadd\t$dst, $a, $b, $c",
            vec.prefix#".relaxed_nmadd", simdopS, reqs>;
 
-  def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
-             (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+  def : Pat<(fadd_contract (fmul_contract (vec.vt V128:$a), (vec.vt V128:$b)), (vec.vt V128:$c)),
+            (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
+  def : Pat<(fmuladd (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)),
+             (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
 
-  def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
-             (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+  def : Pat<(fsub_contract (vec.vt V128:$c), (fmul_contract (vec.vt V128:$a), (vec.vt V128:$b))),
+            (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
+  def : Pat<(fmuladd (fneg (vec.vt V128:$a)), (vec.vt V128:$b), (vec.vt V128:$c)),
+             (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
 }
 
-defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
-defm "" : SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>;
-defm "" : SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>;
+defm "" : RELAXED_SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
+defm "" : RELAXED_SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>;
+
+//===----------------------------------------------------------------------===//
+// FP16 (Negative) Multiply-Add  (madd/nmadd)
+//===----------------------------------------------------------------------===//
+
+multiclass HALF_PRECISION_SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS,
+                                   list<Predicate> reqs> {
+  defm MADD_#vec :
+    SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+           [(set (vec.vt V128:$dst), (fma
+             (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
+           vec.prefix#".madd\t$dst, $a, $b, $c",
+           vec.prefix#".madd", simdopA, reqs>;
+  defm NMADD_#vec :
+    SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+           [(set (vec.vt V128:$dst), (fma
+             (fneg (vec.vt V128:$a)), (vec.vt V128:$b), (vec.vt V128:$c)))],
+           vec.prefix#".nmadd\t$dst, $a, $b, $c",
+           vec.prefix#".nmadd", simdopS, reqs>;
+}
+defm "" : HALF_PRECISION_SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>;
+
+// TODO: I think separate intrinsics should be introduced for these FP16 operations.
+def : Pat<(v8f16 (int_wasm_relaxed_madd (v8f16 V128:$a), (v8f16 V128:$b), (v8f16 V128:$c))),
+          (MADD_F16x8 V128:$a, V128:$b, V128:$c)>;
+def : Pat<(v8f16 (int_wasm_relaxed_nmadd (v8f16 V128:$a), (v8f16 V128:$b), (v8f16 V128:$c))),
+          (NMADD_F16x8 V128:$a, V128:$b, V128:$c)>;
 
 //===----------------------------------------------------------------------===//
 // Laneselect
diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp
index acf8e4c..5ea63a9 100644
--- a/llvm/lib/TargetParser/RISCVTargetParser.cpp
+++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp
@@ -228,6 +228,10 @@ void printVType(unsigned VType, raw_ostream &OS) {
     OS << ", mu";
 }
 
+void printXSfmmVType(unsigned VType, raw_ostream &OS) {
+  OS << "e" << getSEW(VType) << ", w" << getXSfmmWiden(VType);
+}
+
 unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul) {
   unsigned LMul;
   bool Fractional;
diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h
index 26ec4f3..e05fe28 100644
--- a/llvm/lib/Transforms/Coroutines/CoroCloner.h
+++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h
@@ -1,3 +1,4 @@
+//===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -19,9 +20,7 @@
 #include "llvm/Transforms/Coroutines/CoroInstr.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
-namespace llvm {
-
-namespace coro {
+namespace llvm::coro {
 
 enum class CloneKind {
   /// The shared resume function for a switch lowering.
@@ -149,8 +148,6 @@ public:
   }
 };
 
-} // end namespace coro
-
-} // end namespace llvm
+} // end namespace llvm::coro
 
 #endif // LLVM_LIB_TRANSFORMS_COROUTINES_COROCLONER_H
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index 471b9eb..cdb5852 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -38,7 +38,7 @@ public:
         AnyResumeFnPtrTy(PointerType::getUnqual(Context)) {}
   void lowerEarlyIntrinsics(Function &F);
 };
-}
+} // namespace
 
 // Replace a direct call to coro.resume or coro.destroy with an indirect call to
 // an address returned by coro.subfn.addr intrinsic. This is done so that
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 52f4ffe..cc47a55 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -16,11 +16,7 @@
 #include "llvm/Transforms/Coroutines/CoroInstr.h"
 #include "llvm/Transforms/Coroutines/CoroShape.h"
 
-namespace llvm {
-
-class CallGraph;
-
-namespace coro {
+namespace llvm::coro {
 
 bool isSuspendBlock(BasicBlock *BB);
 bool declaresAnyIntrinsic(const Module &M);
@@ -61,7 +57,6 @@ void normalizeCoroutine(Function &F, coro::Shape &Shape,
 CallInst *createMustTailCall(DebugLoc Loc, Function *MustTailCallFn,
                              TargetTransformInfo &TTI,
                              ArrayRef<Value *> Arguments, IRBuilder<> &);
-} // End namespace coro.
-} // End namespace llvm
+} // End namespace llvm::coro
 
 #endif
diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
index 6aaabca..f2444da 100644
--- a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
@@ -137,8 +137,7 @@ struct RematGraph {
 
 } // namespace
 
-namespace llvm {
-template <> struct GraphTraits<RematGraph *> {
+template <> struct llvm::GraphTraits<RematGraph *> {
   using NodeRef = RematGraph::RematNode *;
   using ChildIteratorType = RematGraph::RematNode **;
 
@@ -149,8 +148,6 @@ template <> struct GraphTraits<RematGraph *> {
   static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
 };
 
-} // end namespace llvm
-
 // For each instruction identified as materializable across the suspend point,
 // and its associated DAG of other rematerializable instructions,
 // recreate the DAG of instructions after the suspend point.
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
index e474c07..81fe0c9 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
@@ -16,11 +16,8 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
-namespace llvm {
-
-namespace coro {
-
-namespace {
+using namespace llvm;
+using namespace llvm::coro;
 
 typedef SmallPtrSet<BasicBlock *, 8> VisitedBlocksSet;
 
@@ -71,7 +68,7 @@ static bool isLocalAlloca(CoroAllocaAllocInst *AI) {
 /// This happens during the all-instructions iteration, so it must not
 /// delete the call.
 static Instruction *
-lowerNonLocalAlloca(CoroAllocaAllocInst *AI, const coro::Shape &Shape,
+lowerNonLocalAlloca(CoroAllocaAllocInst *AI, const Shape &Shape,
                     SmallVectorImpl<Instruction *> &DeadInsts) {
   IRBuilder<> Builder(AI);
   auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr);
@@ -450,10 +447,8 @@ static void collectFrameAlloca(AllocaInst *AI, const coro::Shape &Shape,
                        Visitor.getMayWriteBeforeCoroBegin());
 }
 
-} // namespace
-
-void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
-                           const SuspendCrossingInfo &Checker) {
+void coro::collectSpillsFromArgs(SpillInfo &Spills, Function &F,
+                                 const SuspendCrossingInfo &Checker) {
   // Collect the spills for arguments and other not-materializable values.
   for (Argument &A : F.args())
     for (User *U : A.users())
@@ -461,7 +456,7 @@ void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
         Spills[&A].push_back(cast<Instruction>(U));
 }
 
-void collectSpillsAndAllocasFromInsts(
+void coro::collectSpillsAndAllocasFromInsts(
     SpillInfo &Spills, SmallVector<AllocaInfo, 8> &Allocas,
     SmallVector<Instruction *, 4> &DeadInstructions,
     SmallVector<CoroAllocaAllocInst *, 4> &LocalAllocas, Function &F,
@@ -516,8 +511,8 @@ void collectSpillsAndAllocasFromInsts(
   }
 }
 
-void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
-                              const SuspendCrossingInfo &Checker) {
+void coro::collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
+                                    const SuspendCrossingInfo &Checker) {
   // We don't want the layout of coroutine frame to be affected
   // by debug information. So we only choose to salvage dbg.values for
   // whose value is already in the frame.
@@ -535,10 +530,9 @@ void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
 
 /// Async and Retcon{Once} conventions assume that all spill uses can be sunk
 /// after the coro.begin intrinsic.
-void sinkSpillUsesAfterCoroBegin(const DominatorTree &Dom,
-                                 CoroBeginInst *CoroBegin,
-                                 coro::SpillInfo &Spills,
-                                 SmallVectorImpl<coro::AllocaInfo> &Allocas) {
+void coro::sinkSpillUsesAfterCoroBegin(
+    const DominatorTree &Dom, CoroBeginInst *CoroBegin, coro::SpillInfo &Spills,
+    SmallVectorImpl<coro::AllocaInfo> &Allocas) {
   SmallSetVector<Instruction *, 32> ToMove;
   SmallVector<Instruction *, 32> Worklist;
 
@@ -582,8 +576,9 @@ void sinkSpillUsesAfterCoroBegin(const DominatorTree &Dom,
     Inst->moveBefore(InsertPt->getIterator());
 }
 
-BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def,
-                                         const DominatorTree &DT) {
+BasicBlock::iterator coro::getSpillInsertionPt(const coro::Shape &Shape,
+                                               Value *Def,
+                                               const DominatorTree &DT) {
   BasicBlock::iterator InsertPt;
   if (auto *Arg = dyn_cast<Argument>(Def)) {
     // For arguments, we will place the store instruction right after
@@ -625,7 +620,3 @@ BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def,
 
   return InsertPt;
 }
-
-} // End namespace coro.
-
-} // End namespace llvm.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 7071876..943c223 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -471,7 +471,6 @@ private:
   Value *simplifyNonNullOperand(Value *V, bool HasDereferenceable,
                                 unsigned Depth = 0);
 
-public:
   /// Create `select C, S1, S2`. Use only when the profile cannot be calculated
   /// from existing profile metadata: if the Function has profiles, this will
   /// set the profile of this select to "unknown".
@@ -484,6 +483,7 @@ public:
     return Sel;
   }
 
+public:
   /// Create and insert the idiom we use to indicate a block is unreachable
   /// without having to rewrite the CFG from within InstCombine.
   void CreateNonTerminatorUnreachable(Instruction *InsertAt) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 63e24a0..a330bb7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -110,8 +110,8 @@ static Value *simplifyShiftSelectingPackedElement(Instruction *I,
                               ShrAmt->getName() + ".z");
   // There is no existing !prof metadata we can derive the !prof metadata for
   // this select.
-  Value *Select = IC.createSelectInstWithUnknownProfile(ShrAmtZ, Lower, Upper);
-  IC.Builder.Insert(Select);
+  Value *Select = IC.Builder.CreateSelectWithUnknownProfile(ShrAmtZ, Lower,
+                                                            Upper, DEBUG_TYPE);
   Select->takeName(I);
   return Select;
 }
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index c86092b..a6ec6c1 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/MemoryProfileInfo.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/StaticDataProfileInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
@@ -194,6 +195,30 @@ static bool isAllocationWithHotColdVariant(const Function *Callee,
   }
 }
 
+static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar,
+                                             AnnotationKind Kind) {
+  assert(Kind != llvm::memprof::AnnotationKind::AnnotationOK &&
+         "Should not handle AnnotationOK here");
+  SmallString<32> Reason;
+  switch (Kind) {
+  case llvm::memprof::AnnotationKind::ExplicitSection:
+    ++NumOfMemProfExplicitSectionGlobalVars;
+    Reason.append("explicit section name");
+    break;
+  case llvm::memprof::AnnotationKind::DeclForLinker:
+    Reason.append("linker declaration");
+    break;
+  case llvm::memprof::AnnotationKind::ReservedName:
+    Reason.append("name starts with `llvm.`");
+    break;
+  default:
+    llvm_unreachable("Unexpected annotation kind");
+  }
+  LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to "
+                    << Reason << ".\n");
+  return;
+}
+
 struct AllocMatchInfo {
   uint64_t TotalSize = 0;
   AllocationType AllocType = AllocationType::None;
@@ -775,29 +800,13 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
   return PreservedAnalyses::none();
 }
 
-// Returns true iff the global variable has custom section either by
-// __attribute__((section("name")))
-// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate)
-// or #pragma clang section directives
-// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section).
-static bool hasExplicitSectionName(const GlobalVariable &GVar) {
-  if (GVar.hasSection())
-    return true;
-
-  auto Attrs = GVar.getAttributes();
-  if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") ||
-      Attrs.hasAttribute("relro-section") ||
-      Attrs.hasAttribute("rodata-section"))
-    return true;
-  return false;
-}
-
 bool MemProfUsePass::annotateGlobalVariables(
     Module &M, const memprof::DataAccessProfData *DataAccessProf) {
   if (!AnnotateStaticDataSectionPrefix || M.globals().empty())
     return false;
 
   if (!DataAccessProf) {
+    M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 0U);
     M.getContext().diagnose(DiagnosticInfoPGOProfile(
         MemoryProfileFileName.data(),
         StringRef("Data access profiles not found in memprof. Ignore "
@@ -805,6 +814,7 @@ bool MemProfUsePass::annotateGlobalVariables(
         DS_Warning));
     return false;
   }
+  M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 1U);
 
   bool Changed = false;
   // Iterate all global variables in the module and annotate them based on
@@ -815,13 +825,9 @@ bool MemProfUsePass::annotateGlobalVariables(
   for (GlobalVariable &GVar : M.globals()) {
     assert(!GVar.getSectionPrefix().has_value() &&
            "GVar shouldn't have section prefix yet");
-    if (GVar.isDeclarationForLinker())
-      continue;
-
-    if (hasExplicitSectionName(GVar)) {
-      ++NumOfMemProfExplicitSectionGlobalVars;
-      LLVM_DEBUG(dbgs() << "Global variable " << GVar.getName()
-                        << " has explicit section name. Skip annotating.\n");
+    auto Kind = llvm::memprof::getAnnotationKind(GVar);
+    if (Kind != llvm::memprof::AnnotationKind::AnnotationOK) {
+      HandleUnsupportedAnnotationKinds(GVar, Kind);
       continue;
     }
 
@@ -831,7 +837,6 @@ bool MemProfUsePass::annotateGlobalVariables(
     // TODO: Track string content hash in the profiles and compute it inside the
     // compiler to categeorize the hotness string literals.
     if (Name.starts_with(".str")) {
-
       LLVM_DEBUG(dbgs() << "Skip annotating string literal " << Name << "\n");
       continue;
     }
diff --git a/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp b/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
index ccb86eb..fb39fdd 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
@@ -269,8 +269,7 @@ static bool replaceIfIdentical(PHINode &PHI, PHINode &ReplPHI) {
 
 bool EliminateNewDuplicatePHINodes(BasicBlock *BB,
                                    BasicBlock::phi_iterator FirstExistingPN) {
-  auto NewPHIs = make_range(BB->phis().begin(), FirstExistingPN);
-  assert(!PHIAreRefEachOther(NewPHIs));
+  assert(!PHIAreRefEachOther(make_range(BB->phis().begin(), FirstExistingPN)));
 
   // Deduplicate new PHIs first to reduce the number of comparisons on the
   // following new -> existing pass.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f95d288..88af2cf 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -19460,7 +19460,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       }
       assert(getNumElements(Cond->getType()) == TrueNumElements &&
              "Cannot vectorize Instruction::Select");
-      Value *V = Builder.CreateSelect(Cond, True, False);
+      Value *V =
+          Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
       V = FinalShuffle(V, E);
 
       E->VectorizedValue = V;
@@ -23580,18 +23581,19 @@ class HorizontalReduction {
     switch (Kind) {
     case RecurKind::Or: {
       if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
-        return Builder.CreateSelect(
+        return Builder.CreateSelectWithUnknownProfile(
             LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
-            RHS, Name);
+            RHS, DEBUG_TYPE, Name);
       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                  Name);
     }
     case RecurKind::And: {
       if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
-        return Builder.CreateSelect(
+        return Builder.CreateSelectWithUnknownProfile(
             LHS, RHS,
-            ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
+            ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
+            DEBUG_TYPE, Name);
       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                  Name);
@@ -23612,7 +23614,8 @@ class HorizontalReduction {
       if (UseSelect) {
         CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
         Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
-        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+        return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
+                                                      Name);
       }
       [[fallthrough]];
     case RecurKind::FMax:
diff --git a/llvm/lib/XRay/BlockIndexer.cpp b/llvm/lib/XRay/BlockIndexer.cpp
index f4ba0eb..d0c6853 100644
--- a/llvm/lib/XRay/BlockIndexer.cpp
+++ b/llvm/lib/XRay/BlockIndexer.cpp
@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/BlockIndexer.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Error BlockIndexer::visit(BufferExtents &) { return Error::success(); }
 
@@ -89,6 +89,3 @@ Error BlockIndexer::flush() {
   CurrentBlock.WallclockTime = nullptr;
   return Error::success();
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/BlockPrinter.cpp b/llvm/lib/XRay/BlockPrinter.cpp
index 63a60c3..d85be5b 100644
--- a/llvm/lib/XRay/BlockPrinter.cpp
+++ b/llvm/lib/XRay/BlockPrinter.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/BlockPrinter.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Error BlockPrinter::visit(BufferExtents &R) {
   OS << "\n[New Block]\n";
@@ -108,6 +108,3 @@ Error BlockPrinter::visit(EndBufferRecord &R) {
     auto E = RP.visit(R);
     return E;
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/BlockVerifier.cpp b/llvm/lib/XRay/BlockVerifier.cpp
index 99f255e..e39f6b6 100644
--- a/llvm/lib/XRay/BlockVerifier.cpp
+++ b/llvm/lib/XRay/BlockVerifier.cpp
@@ -10,19 +10,18 @@
 
 #include <bitset>
 
-namespace llvm {
-namespace xray {
-namespace {
+using namespace llvm;
+using namespace llvm::xray;
 
-constexpr unsigned long long mask(BlockVerifier::State S) {
+static constexpr unsigned long long mask(BlockVerifier::State S) {
   return 1uLL << static_cast<std::size_t>(S);
 }
 
-constexpr std::size_t number(BlockVerifier::State S) {
+static constexpr std::size_t number(BlockVerifier::State S) {
   return static_cast<std::size_t>(S);
 }
 
-StringRef recordToString(BlockVerifier::State R) {
+static StringRef recordToString(BlockVerifier::State R) {
   switch (R) {
   case BlockVerifier::State::BufferExtents:
     return "BufferExtents";
@@ -53,6 +52,8 @@ StringRef recordToString(BlockVerifier::State R) {
   llvm_unreachable("Unkown state!");
 }
 
+namespace {
+
 struct Transition {
   BlockVerifier::State From;
   std::bitset<number(BlockVerifier::State::StateMax)> ToStates;
@@ -133,7 +134,7 @@ Error BlockVerifier::transition(State To) {
 
   CurrentRecord = To;
   return Error::success();
-} // namespace xray
+}
 
 Error BlockVerifier::visit(BufferExtents &) {
   return transition(State::BufferExtents);
@@ -201,6 +202,3 @@ Error BlockVerifier::verify() {
 }
 
 void BlockVerifier::reset() { CurrentRecord = State::Unknown; }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRRecordProducer.cpp b/llvm/lib/XRay/FDRRecordProducer.cpp
index 479b710..0f4eed1 100644
--- a/llvm/lib/XRay/FDRRecordProducer.cpp
+++ b/llvm/lib/XRay/FDRRecordProducer.cpp
@@ -10,8 +10,8 @@
 
 #include <cstdint>
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 namespace {
 
@@ -31,8 +31,9 @@ enum MetadataRecordKinds : uint8_t {
   // This is an end marker, used to identify the upper bound for this enum.
   EnumEndMarker,
 };
+} // namespace
 
-Expected<std::unique_ptr<Record>>
+static Expected<std::unique_ptr<Record>>
 metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
 
   if (T >= static_cast<uint8_t>(MetadataRecordKinds::EnumEndMarker))
@@ -72,12 +73,10 @@ metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
   llvm_unreachable("Unhandled MetadataRecordKinds enum value");
 }
 
-constexpr bool isMetadataIntroducer(uint8_t FirstByte) {
+static constexpr bool isMetadataIntroducer(uint8_t FirstByte) {
   return FirstByte & 0x01u;
 }
 
-} // namespace
-
 Expected<std::unique_ptr<Record>>
 FileBasedRecordProducer::findNextBufferExtent() {
   // We seek one byte at a time until we find a suitable buffer extents metadata
@@ -193,6 +192,3 @@ Expected<std::unique_ptr<Record>> FileBasedRecordProducer::produce() {
   assert(R != nullptr);
   return std::move(R);
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRRecords.cpp b/llvm/lib/XRay/FDRRecords.cpp
index ff315d3..a18f733 100644
--- a/llvm/lib/XRay/FDRRecords.cpp
+++ b/llvm/lib/XRay/FDRRecords.cpp
@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/FDRRecords.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Error BufferExtents::apply(RecordVisitor &V) { return V.visit(*this); }
 Error WallclockRecord::apply(RecordVisitor &V) { return V.visit(*this); }
@@ -61,6 +61,3 @@ StringRef Record::kindToString(RecordKind K) {
   }
   return "Unknown";
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRTraceExpander.cpp b/llvm/lib/XRay/FDRTraceExpander.cpp
index b68e997..991e6e5 100644
--- a/llvm/lib/XRay/FDRTraceExpander.cpp
+++ b/llvm/lib/XRay/FDRTraceExpander.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/FDRTraceExpander.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 void TraceExpander::resetCurrentRecord() {
   if (BuildingRecord)
@@ -126,6 +126,3 @@ Error TraceExpander::flush() {
   resetCurrentRecord();
   return Error::success();
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRTraceWriter.cpp b/llvm/lib/XRay/FDRTraceWriter.cpp
index fb59125..3e320a6 100644
--- a/llvm/lib/XRay/FDRTraceWriter.cpp
+++ b/llvm/lib/XRay/FDRTraceWriter.cpp
@@ -12,8 +12,8 @@
 #include "llvm/XRay/FDRTraceWriter.h"
 #include <tuple>
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 namespace {
 
@@ -37,9 +37,10 @@ template <size_t Index> struct IndexedWriter {
     return 0;
   }
 };
+} // namespace
 
 template <uint8_t Kind, class... Values>
-Error writeMetadata(support::endian::Writer &OS, Values &&... Ds) {
+static Error writeMetadata(support::endian::Writer &OS, Values &&...Ds) {
   // The first bit in the first byte of metadata records is always set to 1, so
   // we ensure this is the case when we write out the first byte of the record.
   uint8_t FirstByte = (static_cast<uint8_t>(Kind) << 1) | uint8_t{0x01u};
@@ -54,8 +55,6 @@ Error writeMetadata(support::endian::Writer &OS, Values &&... Ds) {
   return Error::success();
 }
 
-} // namespace
-
 FDRTraceWriter::FDRTraceWriter(raw_ostream &O, const XRayFileHeader &H)
     : OS(O, llvm::endianness::native) {
   // We need to re-construct a header, by writing the fields we care about for
@@ -146,6 +145,3 @@ Error FDRTraceWriter::visit(FunctionRecord &R) {
   OS.write(R.delta());
   return Error::success();
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FileHeaderReader.cpp b/llvm/lib/XRay/FileHeaderReader.cpp
index 6b6daf9..681cef7 100644
--- a/llvm/lib/XRay/FileHeaderReader.cpp
+++ b/llvm/lib/XRay/FileHeaderReader.cpp
@@ -7,12 +7,13 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/FileHeaderReader.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 // Populates the FileHeader reference by reading the first 32 bytes of the file.
-Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
-                                                uint64_t &OffsetPtr) {
+Expected<XRayFileHeader>
+xray::readBinaryFormatHeader(DataExtractor &HeaderExtractor,
+                             uint64_t &OffsetPtr) {
   // FIXME: Maybe deduce whether the data is little or big-endian using some
   // magic bytes in the beginning of the file?
 
@@ -68,6 +69,3 @@ Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
   OffsetPtr += 16;
   return std::move(FileHeader);
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/LogBuilderConsumer.cpp b/llvm/lib/XRay/LogBuilderConsumer.cpp
index ffb49f9..f0fc336 100644
--- a/llvm/lib/XRay/LogBuilderConsumer.cpp
+++ b/llvm/lib/XRay/LogBuilderConsumer.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/FDRRecordConsumer.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Error LogBuilderConsumer::consume(std::unique_ptr<Record> R) {
   if (!R)
@@ -32,6 +32,3 @@ Error PipelineConsumer::consume(std::unique_ptr<Record> R) {
     Result = joinErrors(std::move(Result), R->apply(*V));
   return Result;
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/Profile.cpp b/llvm/lib/XRay/Profile.cpp
index 1b340e5..ecb767b 100644
--- a/llvm/lib/XRay/Profile.cpp
+++ b/llvm/lib/XRay/Profile.cpp
@@ -18,8 +18,8 @@
 #include "llvm/XRay/Trace.h"
 #include <memory>
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Profile::Profile(const Profile &O) {
   // We need to re-create all the tries from the original (O), into the current
@@ -46,6 +46,7 @@ struct BlockHeader {
   uint32_t Number;
   uint64_t Thread;
 };
+} // namespace
 
 static Expected<BlockHeader> readBlockHeader(DataExtractor &Extractor,
                                              uint64_t &Offset) {
@@ -115,8 +116,6 @@ static Expected<Profile::Data> readData(DataExtractor &Extractor,
   return D;
 }
 
-} // namespace
-
 Error Profile::addBlock(Block &&B) {
   if (B.PathData.empty())
     return make_error<StringError>(
@@ -189,7 +188,7 @@ Profile::PathID Profile::internPath(ArrayRef<FuncID> P) {
   return Node->ID;
 }
 
-Profile mergeProfilesByThread(const Profile &L, const Profile &R) {
+Profile xray::mergeProfilesByThread(const Profile &L, const Profile &R) {
   Profile Merged;
   using PathDataMap = DenseMap<Profile::PathID, Profile::Data>;
   using PathDataMapPtr = std::unique_ptr<PathDataMap>;
@@ -228,7 +227,7 @@ Profile mergeProfilesByThread(const Profile &L, const Profile &R) {
   return Merged;
 }
 
-Profile mergeProfilesByStack(const Profile &L, const Profile &R) {
+Profile xray::mergeProfilesByStack(const Profile &L, const Profile &R) {
   Profile Merged;
   using PathDataMap = DenseMap<Profile::PathID, Profile::Data>;
   PathDataMap PathData;
@@ -258,7 +257,7 @@ Profile mergeProfilesByStack(const Profile &L, const Profile &R) {
   return Merged;
 }
 
-Expected<Profile> loadProfile(StringRef Filename) {
+Expected<Profile> xray::loadProfile(StringRef Filename) {
   Expected<sys::fs::file_t> FdOrErr = sys::fs::openNativeFileForRead(Filename);
   if (!FdOrErr)
     return FdOrErr.takeError();
@@ -322,7 +321,7 @@ struct StackEntry {
 
 } // namespace
 
-Expected<Profile> profileFromTrace(const Trace &T) {
+Expected<Profile> xray::profileFromTrace(const Trace &T) {
   Profile P;
 
   // The implementation of the algorithm re-creates the execution of
@@ -397,6 +396,3 @@ Expected<Profile> profileFromTrace(const Trace &T) {
 
   return P;
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/RecordInitializer.cpp b/llvm/lib/XRay/RecordInitializer.cpp
index 68ab3db..83d5f14 100644
--- a/llvm/lib/XRay/RecordInitializer.cpp
+++ b/llvm/lib/XRay/RecordInitializer.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/XRay/FDRRecords.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Error RecordInitializer::visit(BufferExtents &R) {
   if (!E.isValidOffsetForDataOfSize(OffsetPtr, sizeof(uint64_t)))
@@ -426,6 +426,3 @@ Error RecordInitializer::visit(FunctionRecord &R) {
   assert(FunctionRecord::kFunctionRecordSize == (OffsetPtr - BeginOffset));
   return Error::success();
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/RecordPrinter.cpp b/llvm/lib/XRay/RecordPrinter.cpp
index 32d4210..b9b7a16 100644
--- a/llvm/lib/XRay/RecordPrinter.cpp
+++ b/llvm/lib/XRay/RecordPrinter.cpp
@@ -9,8 +9,8 @@
 
 #include "llvm/Support/FormatVariadic.h"
 
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
 
 Error RecordPrinter::visit(BufferExtents &R) {
   OS << formatv("<Buffer: size = {0} bytes>", R.size()) << Delim;
@@ -103,6 +103,3 @@ Error RecordPrinter::visit(FunctionRecord &R) {
   OS << Delim;
   return Error::success();
 }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/Trace.cpp b/llvm/lib/XRay/Trace.cpp
index 74515b1..14a3f01 100644
--- a/llvm/lib/XRay/Trace.cpp
+++ b/llvm/lib/XRay/Trace.cpp
@@ -29,11 +29,9 @@ using namespace llvm;
 using namespace llvm::xray;
 using llvm::yaml::Input;
 
-namespace {
-
-Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
-                         XRayFileHeader &FileHeader,
-                         std::vector<XRayRecord> &Records) {
+static Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
+                                XRayFileHeader &FileHeader,
+                                std::vector<XRayRecord> &Records) {
   if (Data.size() < 32)
     return make_error<StringError>(
         "Not enough bytes for an XRay log.",
@@ -265,8 +263,9 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
 /// what FunctionRecord instances use, and we no longer need to include the CPU
 /// id in the CustomEventRecord.
 ///
-Error loadFDRLog(StringRef Data, bool IsLittleEndian,
-                 XRayFileHeader &FileHeader, std::vector<XRayRecord> &Records) {
+static Error loadFDRLog(StringRef Data, bool IsLittleEndian,
+                        XRayFileHeader &FileHeader,
+                        std::vector<XRayRecord> &Records) {
 
   if (Data.size() < 32)
     return createStringError(std::make_error_code(std::errc::invalid_argument),
@@ -348,8 +347,8 @@ Error loadFDRLog(StringRef Data, bool IsLittleEndian,
   return Error::success();
 }
 
-Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
-                  std::vector<XRayRecord> &Records) {
+static Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
+                         std::vector<XRayRecord> &Records) {
   YAMLXRayTrace Trace;
   Input In(Data);
   In >> Trace;
@@ -376,7 +375,6 @@ Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
                  });
   return Error::success();
 }
-} // namespace
 
 Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) {
   Expected<sys::fs::file_t> FdOrErr = sys::fs::openNativeFileForRead(Filename);
diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
new file mode 100644
index 0000000..a4aad57
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
@@ -0,0 +1,59 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+---
+name: buffer_load_lds_not_valu
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: buffer_load_lds_not_valu
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $exec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF2]], [[DEF3]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF3]], [[V_ADD_U32_e32_]], implicit $exec
+    ; CHECK-NEXT: $m0 = S_MOV_B32 0
+    ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+    ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]], implicit $exec
+    ; CHECK-NEXT: $m0 = S_MOV_B32 1
+    ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+    ; CHECK-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_3]], [[V_ADD_U32_e32_4]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]], implicit $exec
+    ; CHECK-NEXT: dead [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_5]], [[V_ADD_U32_e32_6]], implicit $exec
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 4, 0
+    ; CHECK-NEXT: S_ENDPGM 0
+    $exec = IMPLICIT_DEF
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sgpr_128 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:vgpr_32 = V_ADD_U32_e32 %2, %3, implicit $exec
+    %5:vgpr_32 = V_ADD_U32_e32 %3, %4, implicit $exec
+    $m0 = S_MOV_B32 0
+    BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+    $m0 = S_MOV_B32 1
+    BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+    %6:vgpr_32 = V_ADD_U32_e32 %4, %5, implicit $exec
+    %7:vgpr_32 = V_ADD_U32_e32 %5, %6, implicit $exec
+    %8:vgpr_32 = V_ADD_U32_e32 %6, %7, implicit $exec
+    %9:vgpr_32 = V_ADD_U32_e32 %7, %8, implicit $exec
+    %10:vgpr_32 = V_ADD_U32_e32 %8, %9, implicit $exec
+    %11:vgpr_32 = V_ADD_U32_e32 %9, %10, implicit $exec
+    SCHED_GROUP_BARRIER 2, 2, 0
+    SCHED_GROUP_BARRIER 4, 1 ,0
+    SCHED_GROUP_BARRIER 2, 2, 0
+    SCHED_GROUP_BARRIER 4, 1 ,0
+    SCHED_GROUP_BARRIER 2, 4, 0
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir
new file mode 100644
index 0000000..389283a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir
@@ -0,0 +1,523 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v \
+# RUN:     -run-pass=phi-node-elimination,register-coalescer,riscv-insert-vsetvli | FileCheck %s
+
+--- |
+  define void @xsfmm_same_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 noundef %tm, i64 noundef %tn, i64 noundef %tk) {
+  entry:
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    ret void
+  }
+
+  define void @xsfmm_different_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk) {
+  entry:
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 4)
+    ret void
+  }
+
+  define void @xsfmm_different_state_bf(<vscale x 32 x half> %tile1, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk) {
+  entry:
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64 2, <vscale x 32 x bfloat> %tile2, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    ret void
+  }
+
+  define <vscale x 64 x i8> @interleave_rvv_and_xsfmm(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) {
+  entry:
+    %0 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl)
+    %1 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl)
+    call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl)
+    ret <vscale x 64 x i8> %1
+  }
+
+  define <vscale x 64 x i8> @interleave_rvv_and_xsfmm2(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) {
+  entry:
+    %0 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %tile, i64 %vl)
+    %1 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl)
+    %2 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl)
+    call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl)
+    ret <vscale x 64 x i8> %2
+  }
+
+  define void @consecutive_xsfmm(<vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, ptr %base) {
+  entry:
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 0, <vscale x 32 x half> %tile, <vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    call void @llvm.riscv.sf.vste16.i64(i64 0, ptr %base, i64 %tn)
+    ret void
+  }
+
+  define i64 @vsettnt_max(i64 %vl) {
+  entry:
+    %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2)
+    %1 = call i64 @llvm.riscv.sf.vsettnt_max.i64(i64 1, i64 2)
+    ret i64 %0
+  }
+
+  define i64 @single_vsettm(i64 %vl) {
+  entry:
+    %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2)
+    ret i64 %0
+  }
+
+  define i64 @single_vsettn(i64 %vl) {
+  entry:
+    %0 = call i64 @llvm.riscv.sf.vsettn.i64(i64 %vl, i64 1, i64 2)
+    ret i64 %0
+  }
+
+  define i64 @single_vsettk(i64 %vl) {
+  entry:
+    %0 = call i64 @llvm.riscv.sf.vsettk.i64(i64 %vl, i64 1, i64 2)
+    ret i64 %0
+  }
+
+  define void @sf_vtzero(i64 %tm, i64 %tn) {
+  entry:
+    call void @llvm.riscv.sf.vtzero.i64(i64 1, i64 %tm, i64 %tn, i64 3, i64 4)
+    ret void
+  }
+
+  declare void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64, <vscale x 32 x half>, <vscale x 32 x half>, i64, i64, i64, i64)
+  declare void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64, <vscale x 32 x bfloat>, <vscale x 32 x bfloat>, i64, i64, i64, i64)
+  declare <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64, i64)
+  declare <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i8>, i64)
+  declare void @llvm.riscv.sf.vste16.i64(i64, ptr, i64)
+  declare i64 @llvm.riscv.sf.vsettnt_max.i64(i64, i64)
+  declare i64 @llvm.riscv.sf.vsettm.i64(i64, i64, i64)
+  declare i64 @llvm.riscv.sf.vsettn.i64(i64, i64, i64)
+  declare i64 @llvm.riscv.sf.vsettk.i64(i64, i64, i64)
+  declare void @llvm.riscv.sf.vtzero.i64(i64, i64, i64, i64, i64)
+...
+---
+name:            xsfmm_same_state
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vrm8 }
+  - { id: 1, class: vrm8 }
+  - { id: 2, class: gprnox0 }
+  - { id: 3, class: gprnox0 }
+  - { id: 4, class: gprnox0 }
+liveins:
+  - { reg: '$v8m8', virtual-reg: '%0' }
+  - { reg: '$v8m8', virtual-reg: '%1' }
+  - { reg: '$x10', virtual-reg: '%2' }
+  - { reg: '$x11', virtual-reg: '%3' }
+  - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $v8m8, $v16m8, $x10, $x11, $x12
+    ; CHECK-LABEL: name: xsfmm_same_state
+    ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %4:gprnox0 = COPY $x12
+    %3:gprnox0 = COPY $x11
+    %2:gprnox0 = COPY $x10
+    %1:vrm8 = COPY $v16m8
+    %0:vrm8 = COPY $v8m8
+    PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+    PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+    PseudoRET
+...
+---
+name:            xsfmm_different_state
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vrm8 }
+  - { id: 1, class: vrm8 }
+  - { id: 2, class: gprnox0 }
+  - { id: 3, class: gprnox0 }
+  - { id: 4, class: gprnox0 }
+liveins:
+  - { reg: '$v8m8', virtual-reg: '%0' }
+  - { reg: '$v8m8', virtual-reg: '%1' }
+  - { reg: '$x10', virtual-reg: '%2' }
+  - { reg: '$x11', virtual-reg: '%3' }
+  - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $v8m8, $v16m8, $x10, $x11, $x12
+    ; CHECK-LABEL: name: xsfmm_different_state
+    ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1544 /* e16, w4 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 3, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 3, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 4, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %4:gprnox0 = COPY $x12
+    %3:gprnox0 = COPY $x11
+    %2:gprnox0 = COPY $x10
+    %1:vrm8 = COPY $v16m8
+    %0:vrm8 = COPY $v8m8
+    PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+    PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 4, implicit $frm
+    PseudoRET
+...
+---
+name:            xsfmm_different_state_bf
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vrm8 }
+  - { id: 1, class: vrm8 }
+  - { id: 2, class: gprnox0 }
+  - { id: 3, class: gprnox0 }
+  - { id: 4, class: gprnox0 }
+liveins:
+  - { reg: '$v8m8', virtual-reg: '%0' }
+  - { reg: '$v8m8', virtual-reg: '%1' }
+  - { reg: '$x10', virtual-reg: '%2' }
+  - { reg: '$x11', virtual-reg: '%3' }
+  - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $v8m8, $v16m8, $x10, $x11, $x12
+    ; CHECK-LABEL: name: xsfmm_different_state_bf
+    ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1288 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F_ALT $t2, [[COPY3]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %4:gprnox0 = COPY $x12
+    %3:gprnox0 = COPY $x11
+    %2:gprnox0 = COPY $x10
+    %1:vrm8 = COPY $v16m8
+    %0:vrm8 = COPY $v8m8
+    PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+    PseudoSF_MM_F_F_ALT $t2, %1:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+    PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+    PseudoRET
+...
+---
+name:            interleave_rvv_and_xsfmm
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vrm8 }
+  - { id: 1, class: gprnox0 }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+  - { id: 4, class: vrm8 }
+  - { id: 5, class: vrm8 }
+liveins:
+  - { reg: '$v8m8', virtual-reg: '%0' }
+  - { reg: '$x10', virtual-reg: '%1' }
+  - { reg: '$x11', virtual-reg: '%2' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $v8m8, $x10, $x11
+    ; CHECK-LABEL: name: interleave_rvv_and_xsfmm
+    ; CHECK: liveins: $v8m8, $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8
+    ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[PseudoSF_VTMV_V_T]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_]], implicit $vtype
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %2:gpr = COPY $x11
+    %1:gprnox0 = COPY $x10
+    %0:vrm8 = COPY $v8m8
+    %3:gpr = ADDI $x0, 1
+    %4:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1
+    %5:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0
+    PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1
+    $v8m8 = COPY %5:vrm8
+    PseudoRET implicit $v8m8
+...
+---
+name:            interleave_rvv_and_xsfmm2
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vrm8 }
+  - { id: 1, class: gprnox0 }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+  - { id: 4, class: vrm8 }
+  - { id: 5, class: vrm8 }
+liveins:
+  - { reg: '$v8m8', virtual-reg: '%0' }
+  - { reg: '$x10', virtual-reg: '%1' }
+  - { reg: '$x11', virtual-reg: '%2' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $v8m8, $x10, $x11
+    ; CHECK-LABEL: name: interleave_rvv_and_xsfmm2
+    ; CHECK: liveins: $v8m8, $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8
+    ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1
+    ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[COPY2]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoVADD_VV_M8_1:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[PseudoVADD_VV_M8_]], [[PseudoVADD_VV_M8_]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_1]], implicit $vtype
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %2:gpr = COPY $x11
+    %1:gprnox0 = COPY $x10
+    %0:vrm8 = COPY $v8m8
+    %3:gpr = ADDI $x0, 1
+    %4:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %0:vrm8, %1:gprnox0, 3, 0
+    %5:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1
+    %6:vrm8 = PseudoVADD_VV_M8 $noreg, %4:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0
+    PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1
+    $v8m8 = COPY %6:vrm8
+    PseudoRET implicit $v8m8
+...
+---
+name:            consecutive_xsfmm
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vrm8 }
+  - { id: 1, class: gprnox0 }
+  - { id: 2, class: gprnox0 }
+  - { id: 3, class: gprnox0 }
+  - { id: 4, class: gprnox0 }
+liveins:
+  - { reg: '$v8m8', virtual-reg: '%0' }
+  - { reg: '$x10', virtual-reg: '%1' }
+  - { reg: '$x11', virtual-reg: '%2' }
+  - { reg: '$x12', virtual-reg: '%3' }
+  - { reg: '$x13', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $v8m8, $x10, $x11, $x12, $x13
+    ; CHECK-LABEL: name: consecutive_xsfmm
+    ; CHECK: liveins: $v8m8, $x10, $x11, $x12, $x13
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vrm8 = COPY $v8m8
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x11
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprnox0 = COPY $x12
+    ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:gprnox0 = COPY $x13
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY2]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY1]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY3]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY]], [[COPY]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY3]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: PseudoSF_VSTE16 [[COPY1]], [[COPY2]], $noreg, 4, 1, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %0:vrm8 = COPY $v8m8
+    %1:gprnox0 = COPY $x10
+    %2:gprnox0 = COPY $x11
+    %3:gprnox0 = COPY $x12
+    %4:gprnox0 = COPY $x13
+    PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 2, implicit $frm
+    PseudoSF_VSTE16 %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 1
+    PseudoRET
+...
+---
+name:            vsettnt_max
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprnox0 }
+liveins:
+  - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $x10
+    ; CHECK-LABEL: name: vsettnt_max
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+    ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_1:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+    ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprnox0 = COPY $x10
+    %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+    %2:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype
+    %3:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+    $x10 = COPY %3:gprnox0
+    PseudoRET implicit $x10
+...
+---
+name:            single_vsettm
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprnox0 }
+liveins:
+  - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $x10
+    ; CHECK-LABEL: name: single_vsettm
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+    ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprnox0 = COPY $x10
+    %1:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+    $x10 = COPY %1:gprnox0
+    PseudoRET implicit $x10
+...
+---
+name:            single_vsettn
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprnox0 }
+liveins:
+  - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $x10
+    ; CHECK-LABEL: name: single_vsettn
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[PseudoSF_VSETTNT:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNT [[COPY]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTNT]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprnox0 = COPY $x10
+    %1:gprnox0 = PseudoSF_VSETTNT %0:gprnox0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype
+    $x10 = COPY %1:gprnox0
+    PseudoRET implicit $x10
+...
+---
+name:            single_vsettk
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprnox0 }
+liveins:
+  - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $x10
+    ; CHECK-LABEL: name: single_vsettk
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+    ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTK]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprnox0 = COPY $x10
+    %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+    $x10 = COPY %1:gprnox0
+    PseudoRET implicit $x10
+...
+---
+name:            sf_vtzero
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprnox0 }
+  - { id: 1, class: gprnox0 }
+liveins:
+  - { reg: '$x10', virtual-reg: '%0' }
+  - { reg: '$x11', virtual-reg: '%1' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+    ; CHECK-LABEL: name: sf_vtzero
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1536 /* e8, w4 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY]], 3, 3, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_VTZERO_T $t1, $noreg, $noreg, 3, 4, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %0:gprnox0 = COPY $x10
+    %1:gprnox0 = COPY $x11
+    PseudoSF_VTZERO_T $t1, %0:gprnox0, %1:gprnox0, 3, 4
+    PseudoRET
+...
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
new file mode 100644
index 0000000..3654aae
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mattr=+simd128 | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+define <4 x i32> @dot_sext_1(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_sext_1:
+; CHECK:         .functype dot_sext_1 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.dot_i16x8_s
+; CHECK-NEXT:    # fallthrough-return
+  %sext1 = sext <8 x i16> %a to <8 x i32>
+  %sext2 = sext <8 x i16> %b to <8 x i32>
+  %mul = mul <8 x i32> %sext1, %sext2
+  %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %res = add <4 x i32> %shuffle1, %shuffle2
+  ret <4 x i32> %res
+}
+
+
+define <4 x i32> @dot_sext_2(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_sext_2:
+; CHECK:         .functype dot_sext_2 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.dot_i16x8_s
+; CHECK-NEXT:    # fallthrough-return
+  %sext1 = sext <8 x i16> %a to <8 x i32>
+  %sext2 = sext <8 x i16> %b to <8 x i32>
+  %mul = mul <8 x i32> %sext1, %sext2
+  %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %res = add <4 x i32> %shuffle2, %shuffle1
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @dot_sext_self(<8 x i16> %v) {
+; CHECK-LABEL: dot_sext_self:
+; CHECK:         .functype dot_sext_self (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.dot_i16x8_s
+; CHECK-NEXT:    # fallthrough-return
+  %sext = sext <8 x i16> %v to <8 x i32>
+  %mul = mul <8 x i32> %sext, %sext
+  %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %res = add <4 x i32> %shuffle1, %shuffle2
+  ret <4 x i32> %res
+}
+
+; INFO: Negative test
+define <4 x i32> @dot_zext(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_zext:
+; CHECK:         .functype dot_zext (v128, v128) -> (v128)
+; CHECK-NEXT:    .local v128
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extmul_low_i16x8_u
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extmul_high_i16x8_u
+; CHECK-NEXT:    local.tee 1
+; CHECK-NEXT:    i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT:    i32x4.add
+; CHECK-NEXT:    # fallthrough-return
+  %zext1 = zext <8 x i16> %a to <8 x i32>
+  %zext2 = zext <8 x i16> %b to <8 x i32>
+  %mul = mul <8 x i32> %zext1, %zext2
+  %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %res = add <4 x i32> %shuffle1, %shuffle2
+  ret <4 x i32> %res
+}
+
+; INFO: Negative test
+define <4 x i32> @dot_wrong_shuffle(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_wrong_shuffle:
+; CHECK:         .functype dot_wrong_shuffle (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extmul_low_i16x8_s
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extmul_high_i16x8_s
+; CHECK-NEXT:    i32x4.add
+; CHECK-NEXT:    # fallthrough-return
+  %sext1 = sext <8 x i16> %a to <8 x i32>
+  %sext2 = sext <8 x i16> %b to <8 x i32>
+  %mul = mul <8 x i32> %sext1, %sext2
+  %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %res = add <4 x i32> %shuffle1, %shuffle2
+  ret <4 x i32> %res
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
index e065de3..600241a 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -2,9 +2,278 @@
 
 ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers  -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s --check-prefix=RELAXED
 ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers  -mattr=+fp16,+simd128,              | FileCheck %s --check-prefix=STRICT
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers  -mattr=+simd128                     | FileCheck %s --check-prefix=NOFP16
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers                                      | FileCheck %s --check-prefix=NOSIMD
 
 target triple = "wasm32"
 
+define half @fadd_fmul_contract_f16(half %a, half %b, half %c) {
+; RELAXED-LABEL: fadd_fmul_contract_f16:
+; RELAXED:         .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    call $push0=, __truncsfhf2, $0
+; RELAXED-NEXT:    call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT:    call $push2=, __truncsfhf2, $1
+; RELAXED-NEXT:    call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT:    f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT:    call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT:    call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT:    f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT:    return $pop7
+;
+; STRICT-LABEL: fadd_fmul_contract_f16:
+; STRICT:         .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    call $push0=, __truncsfhf2, $0
+; STRICT-NEXT:    call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT:    call $push2=, __truncsfhf2, $1
+; STRICT-NEXT:    call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT:    f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT:    call $push5=, __truncsfhf2, $2
+; STRICT-NEXT:    call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT:    f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT:    return $pop7
+;
+; NOFP16-LABEL: fadd_fmul_contract_f16:
+; NOFP16:         .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, __truncsfhf2, $0
+; NOFP16-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT:    call $push2=, __truncsfhf2, $1
+; NOFP16-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT:    call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT:    return $pop7
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f16:
+; NOSIMD:         .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, __truncsfhf2, $0
+; NOSIMD-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT:    call $push2=, __truncsfhf2, $1
+; NOSIMD-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT:    call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT:    return $pop7
+  %mul = fmul contract half %b, %a
+  %add = fadd contract half %mul, %c
+  ret half %add
+}
+
+define half @fmuladd_contract_f16(half %a, half %b, half %c) {
+; RELAXED-LABEL: fmuladd_contract_f16:
+; RELAXED:         .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    call $push0=, __truncsfhf2, $1
+; RELAXED-NEXT:    call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT:    call $push2=, __truncsfhf2, $0
+; RELAXED-NEXT:    call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT:    f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT:    call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT:    call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT:    f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT:    return $pop7
+;
+; STRICT-LABEL: fmuladd_contract_f16:
+; STRICT:         .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    call $push0=, __truncsfhf2, $1
+; STRICT-NEXT:    call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT:    call $push2=, __truncsfhf2, $0
+; STRICT-NEXT:    call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT:    f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT:    call $push5=, __truncsfhf2, $2
+; STRICT-NEXT:    call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT:    f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT:    return $pop7
+;
+; NOFP16-LABEL: fmuladd_contract_f16:
+; NOFP16:         .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, __truncsfhf2, $1
+; NOFP16-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT:    call $push2=, __truncsfhf2, $0
+; NOFP16-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT:    call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT:    return $pop7
+;
+; NOSIMD-LABEL: fmuladd_contract_f16:
+; NOSIMD:         .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, __truncsfhf2, $1
+; NOSIMD-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT:    call $push2=, __truncsfhf2, $0
+; NOSIMD-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT:    call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT:    return $pop7
+  %fma = call contract half @llvm.fmuladd(half %a, half %b, half %c)
+  ret half %fma
+}
+
+define half @fmuladd_f16(half %a, half %b, half %c) {
+; RELAXED-LABEL: fmuladd_f16:
+; RELAXED:         .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    call $push0=, __truncsfhf2, $1
+; RELAXED-NEXT:    call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT:    call $push2=, __truncsfhf2, $0
+; RELAXED-NEXT:    call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT:    f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT:    call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT:    call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT:    f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT:    return $pop7
+;
+; STRICT-LABEL: fmuladd_f16:
+; STRICT:         .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    call $push0=, __truncsfhf2, $1
+; STRICT-NEXT:    call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT:    call $push2=, __truncsfhf2, $0
+; STRICT-NEXT:    call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT:    f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT:    call $push5=, __truncsfhf2, $2
+; STRICT-NEXT:    call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT:    f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT:    return $pop7
+;
+; NOFP16-LABEL: fmuladd_f16:
+; NOFP16:         .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, __truncsfhf2, $1
+; NOFP16-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT:    call $push2=, __truncsfhf2, $0
+; NOFP16-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT:    call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT:    return $pop7
+;
+; NOSIMD-LABEL: fmuladd_f16:
+; NOSIMD:         .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, __truncsfhf2, $1
+; NOSIMD-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT:    call $push2=, __truncsfhf2, $0
+; NOSIMD-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT:    call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT:    return $pop7
+  %fma = call half @llvm.fmuladd(half %a, half %b, half %c)
+  ret half %fma
+}
+
+
+define float @fadd_fmul_contract_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fadd_fmul_contract_f32:
+; RELAXED:         .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32.mul $push0=, $1, $0
+; RELAXED-NEXT:    f32.add $push1=, $pop0, $2
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fadd_fmul_contract_f32:
+; STRICT:         .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32.mul $push0=, $1, $0
+; STRICT-NEXT:    f32.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_f32:
+; NOFP16:         .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32.mul $push0=, $1, $0
+; NOFP16-NEXT:    f32.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f32:
+; NOSIMD:         .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $1, $0
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT:    return $pop1
+  %mul = fmul contract float %b, %a
+  %add = fadd contract float %mul, %c
+  ret float %add
+}
+
+define float @fmuladd_contract_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fmuladd_contract_f32:
+; RELAXED:         .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32.mul $push0=, $0, $1
+; RELAXED-NEXT:    f32.add $push1=, $pop0, $2
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fmuladd_contract_f32:
+; STRICT:         .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32.mul $push0=, $0, $1
+; STRICT-NEXT:    f32.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_f32:
+; NOFP16:         .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32.mul $push0=, $0, $1
+; NOFP16-NEXT:    f32.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_f32:
+; NOSIMD:         .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $0, $1
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT:    return $pop1
+  %fma = call contract float @llvm.fmuladd(float %a, float %b, float %c)
+  ret float %fma
+}
+
+define float @fmuladd_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fmuladd_f32:
+; RELAXED:         .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32.mul $push0=, $0, $1
+; RELAXED-NEXT:    f32.add $push1=, $pop0, $2
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fmuladd_f32:
+; STRICT:         .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32.mul $push0=, $0, $1
+; STRICT-NEXT:    f32.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_f32:
+; NOFP16:         .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32.mul $push0=, $0, $1
+; NOFP16-NEXT:    f32.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_f32:
+; NOSIMD:         .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $0, $1
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT:    return $pop1
+  %fma = call float @llvm.fmuladd(float %a, float %b, float %c)
+  ret float %fma
+}
+
 define double @fadd_fmul_contract_f64(double %a, double %b, double %c) {
 ; RELAXED-LABEL: fadd_fmul_contract_f64:
 ; RELAXED:         .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
@@ -19,16 +288,94 @@ define double @fadd_fmul_contract_f64(double %a, double %b, double %c) {
 ; STRICT-NEXT:    f64.mul $push0=, $1, $0
 ; STRICT-NEXT:    f64.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_f64:
+; NOFP16:         .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64.mul $push0=, $1, $0
+; NOFP16-NEXT:    f64.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f64:
+; NOSIMD:         .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $1, $0
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT:    return $pop1
   %mul = fmul contract double %b, %a
   %add = fadd contract double %mul, %c
   ret double %add
 }
 
+define double @fmuladd_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fmuladd_f64:
+; RELAXED:         .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f64.mul $push0=, $0, $1
+; RELAXED-NEXT:    f64.add $push1=, $pop0, $2
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fmuladd_f64:
+; STRICT:         .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f64.mul $push0=, $0, $1
+; STRICT-NEXT:    f64.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_f64:
+; NOFP16:         .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64.mul $push0=, $0, $1
+; NOFP16-NEXT:    f64.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_f64:
+; NOSIMD:         .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $0, $1
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT:    return $pop1
+  %fma = call double @llvm.fmuladd(double %a, double %b, double %c)
+  ret double %fma
+}
+
+define double @fmuladd_contract_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fmuladd_contract_f64:
+; RELAXED:         .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f64.mul $push0=, $0, $1
+; RELAXED-NEXT:    f64.add $push1=, $pop0, $2
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fmuladd_contract_f64:
+; STRICT:         .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f64.mul $push0=, $0, $1
+; STRICT-NEXT:    f64.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_f64:
+; NOFP16:         .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64.mul $push0=, $0, $1
+; NOFP16-NEXT:    f64.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_f64:
+; NOSIMD:         .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $0, $1
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT:    return $pop1
+  %fma = call contract double @llvm.fmuladd(double %a, double %b, double %c)
+  ret double %fma
+}
+
 define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; RELAXED-LABEL: fadd_fmul_contract_4xf32:
 ; RELAXED:         .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $1, $0, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fadd_fmul_contract_4xf32:
@@ -37,31 +384,222 @@ define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4
 ; STRICT-NEXT:    f32x4.mul $push0=, $1, $0
 ; STRICT-NEXT:    f32x4.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_4xf32:
+; NOFP16:         .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.mul $push0=, $1, $0
+; NOFP16-NEXT:    f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_4xf32:
+; NOSIMD:         .functype fadd_fmul_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $8, $4
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT:    f32.store 12($0), $pop1
+; NOSIMD-NEXT:    f32.mul $push2=, $7, $3
+; NOSIMD-NEXT:    f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT:    f32.store 8($0), $pop3
+; NOSIMD-NEXT:    f32.mul $push4=, $6, $2
+; NOSIMD-NEXT:    f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT:    f32.store 4($0), $pop5
+; NOSIMD-NEXT:    f32.mul $push6=, $5, $1
+; NOSIMD-NEXT:    f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT:    f32.store 0($0), $pop7
+; NOSIMD-NEXT:    return
   %mul = fmul contract <4 x float> %b, %a
   %add = fadd contract <4 x float> %mul, %c
   ret <4 x float> %add
 }
 
-
 define <8 x half> @fadd_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; RELAXED-LABEL: fadd_fmul_contract_8xf16:
 ; RELAXED:         .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f16x8.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT:    f16x8.madd $push0=, $1, $0, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fadd_fmul_contract_8xf16:
 ; STRICT:         .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
 ; STRICT-NEXT:  # %bb.0:
-; STRICT-NEXT:    f16x8.mul $push0=, $1, $0
-; STRICT-NEXT:    f16x8.add $push1=, $pop0, $2
-; STRICT-NEXT:    return $pop1
+; STRICT-NEXT:    f16x8.madd $push0=, $1, $0, $2
+; STRICT-NEXT:    return $pop0
+;
+; NOFP16-LABEL: fadd_fmul_contract_8xf16:
+; NOFP16:         .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, __truncsfhf2, $8
+; NOFP16-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT:    call $push2=, __truncsfhf2, $16
+; NOFP16-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT:    call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT:    call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT:    i32.store16 14($0), $pop8
+; NOFP16-NEXT:    call $push9=, __truncsfhf2, $7
+; NOFP16-NEXT:    call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT:    call $push11=, __truncsfhf2, $15
+; NOFP16-NEXT:    call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT:    f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT:    call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT:    call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT:    f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT:    call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT:    i32.store16 12($0), $pop17
+; NOFP16-NEXT:    call $push18=, __truncsfhf2, $6
+; NOFP16-NEXT:    call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT:    call $push20=, __truncsfhf2, $14
+; NOFP16-NEXT:    call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT:    f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT:    call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT:    call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT:    f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT:    call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT:    i32.store16 10($0), $pop26
+; NOFP16-NEXT:    call $push27=, __truncsfhf2, $5
+; NOFP16-NEXT:    call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT:    call $push29=, __truncsfhf2, $13
+; NOFP16-NEXT:    call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT:    f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT:    call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT:    call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT:    f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT:    call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT:    i32.store16 8($0), $pop35
+; NOFP16-NEXT:    call $push36=, __truncsfhf2, $4
+; NOFP16-NEXT:    call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT:    call $push38=, __truncsfhf2, $12
+; NOFP16-NEXT:    call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT:    f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT:    call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT:    call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT:    f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT:    call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT:    i32.store16 6($0), $pop44
+; NOFP16-NEXT:    call $push45=, __truncsfhf2, $3
+; NOFP16-NEXT:    call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT:    call $push47=, __truncsfhf2, $11
+; NOFP16-NEXT:    call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT:    f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT:    call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT:    call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT:    f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT:    call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT:    i32.store16 4($0), $pop53
+; NOFP16-NEXT:    call $push54=, __truncsfhf2, $2
+; NOFP16-NEXT:    call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT:    call $push56=, __truncsfhf2, $10
+; NOFP16-NEXT:    call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT:    f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT:    call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT:    call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT:    f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT:    call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT:    i32.store16 2($0), $pop62
+; NOFP16-NEXT:    call $push63=, __truncsfhf2, $1
+; NOFP16-NEXT:    call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT:    call $push65=, __truncsfhf2, $9
+; NOFP16-NEXT:    call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT:    f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT:    call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT:    call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT:    f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT:    call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT:    i32.store16 0($0), $pop71
+; NOFP16-NEXT:    return
+;
+; NOSIMD-LABEL: fadd_fmul_contract_8xf16:
+; NOSIMD:         .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, __truncsfhf2, $8
+; NOSIMD-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT:    call $push2=, __truncsfhf2, $16
+; NOSIMD-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT:    call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT:    call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT:    i32.store16 14($0), $pop8
+; NOSIMD-NEXT:    call $push9=, __truncsfhf2, $7
+; NOSIMD-NEXT:    call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT:    call $push11=, __truncsfhf2, $15
+; NOSIMD-NEXT:    call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT:    f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT:    call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT:    call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT:    f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT:    call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT:    i32.store16 12($0), $pop17
+; NOSIMD-NEXT:    call $push18=, __truncsfhf2, $6
+; NOSIMD-NEXT:    call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT:    call $push20=, __truncsfhf2, $14
+; NOSIMD-NEXT:    call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT:    f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT:    call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT:    call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT:    f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT:    call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT:    i32.store16 10($0), $pop26
+; NOSIMD-NEXT:    call $push27=, __truncsfhf2, $5
+; NOSIMD-NEXT:    call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT:    call $push29=, __truncsfhf2, $13
+; NOSIMD-NEXT:    call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT:    f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT:    call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT:    call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT:    f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT:    call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT:    i32.store16 8($0), $pop35
+; NOSIMD-NEXT:    call $push36=, __truncsfhf2, $4
+; NOSIMD-NEXT:    call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT:    call $push38=, __truncsfhf2, $12
+; NOSIMD-NEXT:    call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT:    f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT:    call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT:    call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT:    f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT:    call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT:    i32.store16 6($0), $pop44
+; NOSIMD-NEXT:    call $push45=, __truncsfhf2, $3
+; NOSIMD-NEXT:    call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT:    call $push47=, __truncsfhf2, $11
+; NOSIMD-NEXT:    call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT:    f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT:    call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT:    call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT:    f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT:    call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT:    i32.store16 4($0), $pop53
+; NOSIMD-NEXT:    call $push54=, __truncsfhf2, $2
+; NOSIMD-NEXT:    call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT:    call $push56=, __truncsfhf2, $10
+; NOSIMD-NEXT:    call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT:    f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT:    call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT:    call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT:    f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT:    call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT:    i32.store16 2($0), $pop62
+; NOSIMD-NEXT:    call $push63=, __truncsfhf2, $1
+; NOSIMD-NEXT:    call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT:    call $push65=, __truncsfhf2, $9
+; NOSIMD-NEXT:    call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT:    f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT:    call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT:    call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT:    f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT:    call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT:    i32.store16 0($0), $pop71
+; NOSIMD-NEXT:    return
   %mul = fmul contract <8 x half> %b, %a
   %add = fadd contract <8 x half> %mul, %c
   ret <8 x half> %add
 }
 
-
 define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; RELAXED-LABEL: fadd_fmul_4xf32:
 ; RELAXED:         .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128)
@@ -76,16 +614,412 @@ define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float>
 ; STRICT-NEXT:    f32x4.mul $push0=, $1, $0
 ; STRICT-NEXT:    f32x4.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_4xf32:
+; NOFP16:         .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.mul $push0=, $1, $0
+; NOFP16-NEXT:    f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_4xf32:
+; NOSIMD:         .functype fadd_fmul_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $8, $4
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT:    f32.store 12($0), $pop1
+; NOSIMD-NEXT:    f32.mul $push2=, $7, $3
+; NOSIMD-NEXT:    f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT:    f32.store 8($0), $pop3
+; NOSIMD-NEXT:    f32.mul $push4=, $6, $2
+; NOSIMD-NEXT:    f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT:    f32.store 4($0), $pop5
+; NOSIMD-NEXT:    f32.mul $push6=, $5, $1
+; NOSIMD-NEXT:    f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT:    f32.store 0($0), $pop7
+; NOSIMD-NEXT:    return
   %mul = fmul <4 x float> %b, %a
   %add = fadd contract <4 x float> %mul, %c
   ret <4 x float> %add
 }
 
+define <8 x half> @fmuladd_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_contract_8xf16:
+; RELAXED:         .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f16x8.madd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_contract_8xf16:
+; STRICT:         .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f16x8.madd $push0=, $0, $1, $2
+; STRICT-NEXT:    return $pop0
+;
+; NOFP16-LABEL: fmuladd_contract_8xf16:
+; NOFP16:         .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, __truncsfhf2, $16
+; NOFP16-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT:    call $push2=, __truncsfhf2, $8
+; NOFP16-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT:    call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT:    call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT:    i32.store16 14($0), $pop8
+; NOFP16-NEXT:    call $push9=, __truncsfhf2, $15
+; NOFP16-NEXT:    call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT:    call $push11=, __truncsfhf2, $7
+; NOFP16-NEXT:    call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT:    f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT:    call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT:    call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT:    f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT:    call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT:    i32.store16 12($0), $pop17
+; NOFP16-NEXT:    call $push18=, __truncsfhf2, $14
+; NOFP16-NEXT:    call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT:    call $push20=, __truncsfhf2, $6
+; NOFP16-NEXT:    call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT:    f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT:    call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT:    call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT:    f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT:    call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT:    i32.store16 10($0), $pop26
+; NOFP16-NEXT:    call $push27=, __truncsfhf2, $13
+; NOFP16-NEXT:    call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT:    call $push29=, __truncsfhf2, $5
+; NOFP16-NEXT:    call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT:    f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT:    call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT:    call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT:    f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT:    call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT:    i32.store16 8($0), $pop35
+; NOFP16-NEXT:    call $push36=, __truncsfhf2, $12
+; NOFP16-NEXT:    call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT:    call $push38=, __truncsfhf2, $4
+; NOFP16-NEXT:    call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT:    f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT:    call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT:    call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT:    f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT:    call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT:    i32.store16 6($0), $pop44
+; NOFP16-NEXT:    call $push45=, __truncsfhf2, $11
+; NOFP16-NEXT:    call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT:    call $push47=, __truncsfhf2, $3
+; NOFP16-NEXT:    call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT:    f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT:    call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT:    call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT:    f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT:    call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT:    i32.store16 4($0), $pop53
+; NOFP16-NEXT:    call $push54=, __truncsfhf2, $10
+; NOFP16-NEXT:    call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT:    call $push56=, __truncsfhf2, $2
+; NOFP16-NEXT:    call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT:    f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT:    call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT:    call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT:    f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT:    call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT:    i32.store16 2($0), $pop62
+; NOFP16-NEXT:    call $push63=, __truncsfhf2, $9
+; NOFP16-NEXT:    call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT:    call $push65=, __truncsfhf2, $1
+; NOFP16-NEXT:    call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT:    f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT:    call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT:    call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT:    f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT:    call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT:    i32.store16 0($0), $pop71
+; NOFP16-NEXT:    return
+;
+; NOSIMD-LABEL: fmuladd_contract_8xf16:
+; NOSIMD:         .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, __truncsfhf2, $16
+; NOSIMD-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT:    call $push2=, __truncsfhf2, $8
+; NOSIMD-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT:    call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT:    call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT:    i32.store16 14($0), $pop8
+; NOSIMD-NEXT:    call $push9=, __truncsfhf2, $15
+; NOSIMD-NEXT:    call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT:    call $push11=, __truncsfhf2, $7
+; NOSIMD-NEXT:    call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT:    f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT:    call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT:    call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT:    f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT:    call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT:    i32.store16 12($0), $pop17
+; NOSIMD-NEXT:    call $push18=, __truncsfhf2, $14
+; NOSIMD-NEXT:    call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT:    call $push20=, __truncsfhf2, $6
+; NOSIMD-NEXT:    call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT:    f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT:    call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT:    call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT:    f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT:    call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT:    i32.store16 10($0), $pop26
+; NOSIMD-NEXT:    call $push27=, __truncsfhf2, $13
+; NOSIMD-NEXT:    call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT:    call $push29=, __truncsfhf2, $5
+; NOSIMD-NEXT:    call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT:    f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT:    call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT:    call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT:    f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT:    call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT:    i32.store16 8($0), $pop35
+; NOSIMD-NEXT:    call $push36=, __truncsfhf2, $12
+; NOSIMD-NEXT:    call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT:    call $push38=, __truncsfhf2, $4
+; NOSIMD-NEXT:    call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT:    f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT:    call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT:    call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT:    f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT:    call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT:    i32.store16 6($0), $pop44
+; NOSIMD-NEXT:    call $push45=, __truncsfhf2, $11
+; NOSIMD-NEXT:    call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT:    call $push47=, __truncsfhf2, $3
+; NOSIMD-NEXT:    call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT:    f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT:    call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT:    call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT:    f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT:    call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT:    i32.store16 4($0), $pop53
+; NOSIMD-NEXT:    call $push54=, __truncsfhf2, $10
+; NOSIMD-NEXT:    call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT:    call $push56=, __truncsfhf2, $2
+; NOSIMD-NEXT:    call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT:    f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT:    call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT:    call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT:    f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT:    call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT:    i32.store16 2($0), $pop62
+; NOSIMD-NEXT:    call $push63=, __truncsfhf2, $9
+; NOSIMD-NEXT:    call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT:    call $push65=, __truncsfhf2, $1
+; NOSIMD-NEXT:    call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT:    f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT:    call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT:    call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT:    f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT:    call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT:    i32.store16 0($0), $pop71
+; NOSIMD-NEXT:    return
+  %fma = call contract <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+  ret <8 x half> %fma
+}
+
+define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_8xf16:
+; RELAXED:         .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f16x8.madd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_8xf16:
+; STRICT:         .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f16x8.madd $push0=, $0, $1, $2
+; STRICT-NEXT:    return $pop0
+;
+; NOFP16-LABEL: fmuladd_8xf16:
+; NOFP16:         .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, __truncsfhf2, $16
+; NOFP16-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT:    call $push2=, __truncsfhf2, $8
+; NOFP16-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT:    call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT:    call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT:    i32.store16 14($0), $pop8
+; NOFP16-NEXT:    call $push9=, __truncsfhf2, $15
+; NOFP16-NEXT:    call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT:    call $push11=, __truncsfhf2, $7
+; NOFP16-NEXT:    call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT:    f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT:    call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT:    call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT:    f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT:    call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT:    i32.store16 12($0), $pop17
+; NOFP16-NEXT:    call $push18=, __truncsfhf2, $14
+; NOFP16-NEXT:    call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT:    call $push20=, __truncsfhf2, $6
+; NOFP16-NEXT:    call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT:    f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT:    call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT:    call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT:    f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT:    call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT:    i32.store16 10($0), $pop26
+; NOFP16-NEXT:    call $push27=, __truncsfhf2, $13
+; NOFP16-NEXT:    call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT:    call $push29=, __truncsfhf2, $5
+; NOFP16-NEXT:    call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT:    f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT:    call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT:    call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT:    f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT:    call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT:    i32.store16 8($0), $pop35
+; NOFP16-NEXT:    call $push36=, __truncsfhf2, $12
+; NOFP16-NEXT:    call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT:    call $push38=, __truncsfhf2, $4
+; NOFP16-NEXT:    call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT:    f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT:    call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT:    call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT:    f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT:    call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT:    i32.store16 6($0), $pop44
+; NOFP16-NEXT:    call $push45=, __truncsfhf2, $11
+; NOFP16-NEXT:    call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT:    call $push47=, __truncsfhf2, $3
+; NOFP16-NEXT:    call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT:    f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT:    call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT:    call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT:    f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT:    call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT:    i32.store16 4($0), $pop53
+; NOFP16-NEXT:    call $push54=, __truncsfhf2, $10
+; NOFP16-NEXT:    call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT:    call $push56=, __truncsfhf2, $2
+; NOFP16-NEXT:    call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT:    f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT:    call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT:    call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT:    f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT:    call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT:    i32.store16 2($0), $pop62
+; NOFP16-NEXT:    call $push63=, __truncsfhf2, $9
+; NOFP16-NEXT:    call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT:    call $push65=, __truncsfhf2, $1
+; NOFP16-NEXT:    call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT:    f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT:    call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT:    call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT:    f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT:    call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT:    i32.store16 0($0), $pop71
+; NOFP16-NEXT:    return
+;
+; NOSIMD-LABEL: fmuladd_8xf16:
+; NOSIMD:         .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, __truncsfhf2, $16
+; NOSIMD-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT:    call $push2=, __truncsfhf2, $8
+; NOSIMD-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT:    call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT:    call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT:    i32.store16 14($0), $pop8
+; NOSIMD-NEXT:    call $push9=, __truncsfhf2, $15
+; NOSIMD-NEXT:    call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT:    call $push11=, __truncsfhf2, $7
+; NOSIMD-NEXT:    call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT:    f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT:    call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT:    call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT:    f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT:    call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT:    i32.store16 12($0), $pop17
+; NOSIMD-NEXT:    call $push18=, __truncsfhf2, $14
+; NOSIMD-NEXT:    call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT:    call $push20=, __truncsfhf2, $6
+; NOSIMD-NEXT:    call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT:    f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT:    call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT:    call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT:    f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT:    call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT:    i32.store16 10($0), $pop26
+; NOSIMD-NEXT:    call $push27=, __truncsfhf2, $13
+; NOSIMD-NEXT:    call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT:    call $push29=, __truncsfhf2, $5
+; NOSIMD-NEXT:    call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT:    f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT:    call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT:    call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT:    f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT:    call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT:    i32.store16 8($0), $pop35
+; NOSIMD-NEXT:    call $push36=, __truncsfhf2, $12
+; NOSIMD-NEXT:    call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT:    call $push38=, __truncsfhf2, $4
+; NOSIMD-NEXT:    call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT:    f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT:    call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT:    call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT:    f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT:    call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT:    i32.store16 6($0), $pop44
+; NOSIMD-NEXT:    call $push45=, __truncsfhf2, $11
+; NOSIMD-NEXT:    call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT:    call $push47=, __truncsfhf2, $3
+; NOSIMD-NEXT:    call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT:    f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT:    call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT:    call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT:    f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT:    call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT:    i32.store16 4($0), $pop53
+; NOSIMD-NEXT:    call $push54=, __truncsfhf2, $10
+; NOSIMD-NEXT:    call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT:    call $push56=, __truncsfhf2, $2
+; NOSIMD-NEXT:    call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT:    f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT:    call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT:    call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT:    f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT:    call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT:    i32.store16 2($0), $pop62
+; NOSIMD-NEXT:    call $push63=, __truncsfhf2, $9
+; NOSIMD-NEXT:    call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT:    call $push65=, __truncsfhf2, $1
+; NOSIMD-NEXT:    call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT:    f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT:    call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT:    call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT:    f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT:    call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT:    i32.store16 0($0), $pop71
+; NOSIMD-NEXT:    return
+  %fma = call <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+  ret <8 x half> %fma
+}
+
 define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; RELAXED-LABEL: fmuladd_contract_4xf32:
 ; RELAXED:         .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $0, $1, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fmuladd_contract_4xf32:
@@ -94,18 +1028,40 @@ define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x
 ; STRICT-NEXT:    f32x4.mul $push0=, $0, $1
 ; STRICT-NEXT:    f32x4.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_4xf32:
+; NOFP16:         .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.mul $push0=, $0, $1
+; NOFP16-NEXT:    f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_4xf32:
+; NOSIMD:         .functype fmuladd_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $4, $8
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT:    f32.store 12($0), $pop1
+; NOSIMD-NEXT:    f32.mul $push2=, $3, $7
+; NOSIMD-NEXT:    f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT:    f32.store 8($0), $pop3
+; NOSIMD-NEXT:    f32.mul $push4=, $2, $6
+; NOSIMD-NEXT:    f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT:    f32.store 4($0), $pop5
+; NOSIMD-NEXT:    f32.mul $push6=, $1, $5
+; NOSIMD-NEXT:    f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT:    f32.store 0($0), $pop7
+; NOSIMD-NEXT:    return
   %fma = call contract <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
   ret <4 x float> %fma
 }
 
-; TODO: This should also have relaxed_madd in RELAXED case
 define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; RELAXED-LABEL: fmuladd_4xf32:
 ; RELAXED:         .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32x4.mul $push0=, $0, $1
-; RELAXED-NEXT:    f32x4.add $push1=, $pop0, $2
-; RELAXED-NEXT:    return $pop1
+; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fmuladd_4xf32:
 ; STRICT:         .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
@@ -113,10 +1069,170 @@ define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c
 ; STRICT-NEXT:    f32x4.mul $push0=, $0, $1
 ; STRICT-NEXT:    f32x4.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_4xf32:
+; NOFP16:         .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.mul $push0=, $0, $1
+; NOFP16-NEXT:    f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_4xf32:
+; NOSIMD:         .functype fmuladd_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $4, $8
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT:    f32.store 12($0), $pop1
+; NOSIMD-NEXT:    f32.mul $push2=, $3, $7
+; NOSIMD-NEXT:    f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT:    f32.store 8($0), $pop3
+; NOSIMD-NEXT:    f32.mul $push4=, $2, $6
+; NOSIMD-NEXT:    f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT:    f32.store 4($0), $pop5
+; NOSIMD-NEXT:    f32.mul $push6=, $1, $5
+; NOSIMD-NEXT:    f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT:    f32.store 0($0), $pop7
+; NOSIMD-NEXT:    return
   %fma = call <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
   ret <4 x float> %fma
 }
 
+define <8 x float> @fmuladd_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; RELAXED-LABEL: fmuladd_8xf32:
+; RELAXED:         .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32x4.mul $push0=, $2, $4
+; RELAXED-NEXT:    f32x4.add $push1=, $pop0, $6
+; RELAXED-NEXT:    v128.store 16($0), $pop1
+; RELAXED-NEXT:    f32x4.mul $push2=, $1, $3
+; RELAXED-NEXT:    f32x4.add $push3=, $pop2, $5
+; RELAXED-NEXT:    v128.store 0($0), $pop3
+; RELAXED-NEXT:    return
+;
+; STRICT-LABEL: fmuladd_8xf32:
+; STRICT:         .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32x4.mul $push0=, $2, $4
+; STRICT-NEXT:    f32x4.add $push1=, $pop0, $6
+; STRICT-NEXT:    v128.store 16($0), $pop1
+; STRICT-NEXT:    f32x4.mul $push2=, $1, $3
+; STRICT-NEXT:    f32x4.add $push3=, $pop2, $5
+; STRICT-NEXT:    v128.store 0($0), $pop3
+; STRICT-NEXT:    return
+;
+; NOFP16-LABEL: fmuladd_8xf32:
+; NOFP16:         .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.mul $push0=, $2, $4
+; NOFP16-NEXT:    f32x4.add $push1=, $pop0, $6
+; NOFP16-NEXT:    v128.store 16($0), $pop1
+; NOFP16-NEXT:    f32x4.mul $push2=, $1, $3
+; NOFP16-NEXT:    f32x4.add $push3=, $pop2, $5
+; NOFP16-NEXT:    v128.store 0($0), $pop3
+; NOFP16-NEXT:    return
+;
+; NOSIMD-LABEL: fmuladd_8xf32:
+; NOSIMD:         .functype fmuladd_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $8, $16
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $24
+; NOSIMD-NEXT:    f32.store 28($0), $pop1
+; NOSIMD-NEXT:    f32.mul $push2=, $7, $15
+; NOSIMD-NEXT:    f32.add $push3=, $pop2, $23
+; NOSIMD-NEXT:    f32.store 24($0), $pop3
+; NOSIMD-NEXT:    f32.mul $push4=, $6, $14
+; NOSIMD-NEXT:    f32.add $push5=, $pop4, $22
+; NOSIMD-NEXT:    f32.store 20($0), $pop5
+; NOSIMD-NEXT:    f32.mul $push6=, $5, $13
+; NOSIMD-NEXT:    f32.add $push7=, $pop6, $21
+; NOSIMD-NEXT:    f32.store 16($0), $pop7
+; NOSIMD-NEXT:    f32.mul $push8=, $4, $12
+; NOSIMD-NEXT:    f32.add $push9=, $pop8, $20
+; NOSIMD-NEXT:    f32.store 12($0), $pop9
+; NOSIMD-NEXT:    f32.mul $push10=, $3, $11
+; NOSIMD-NEXT:    f32.add $push11=, $pop10, $19
+; NOSIMD-NEXT:    f32.store 8($0), $pop11
+; NOSIMD-NEXT:    f32.mul $push12=, $2, $10
+; NOSIMD-NEXT:    f32.add $push13=, $pop12, $18
+; NOSIMD-NEXT:    f32.store 4($0), $pop13
+; NOSIMD-NEXT:    f32.mul $push14=, $1, $9
+; NOSIMD-NEXT:    f32.add $push15=, $pop14, $17
+; NOSIMD-NEXT:    f32.store 0($0), $pop15
+; NOSIMD-NEXT:    return
+  %fma = call <8 x float> @llvm.fmuladd(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+  ret <8 x float> %fma
+}
+
+define <2 x double> @fmuladd_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_contract_2xf64:
+; RELAXED:         .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f64x2.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_contract_2xf64:
+; STRICT:         .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f64x2.mul $push0=, $0, $1
+; STRICT-NEXT:    f64x2.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_2xf64:
+; NOFP16:         .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64x2.mul $push0=, $0, $1
+; NOFP16-NEXT:    f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_2xf64:
+; NOSIMD:         .functype fmuladd_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $2, $4
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT:    f64.store 8($0), $pop1
+; NOSIMD-NEXT:    f64.mul $push2=, $1, $3
+; NOSIMD-NEXT:    f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT:    f64.store 0($0), $pop3
+; NOSIMD-NEXT:    return
+  %fma = call contract <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %fma
+}
+
+define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_2xf64:
+; RELAXED:         .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f64x2.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_2xf64:
+; STRICT:         .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f64x2.mul $push0=, $0, $1
+; STRICT-NEXT:    f64x2.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_2xf64:
+; NOFP16:         .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64x2.mul $push0=, $0, $1
+; NOFP16-NEXT:    f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_2xf64:
+; NOSIMD:         .functype fmuladd_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $2, $4
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT:    f64.store 8($0), $pop1
+; NOSIMD-NEXT:    f64.mul $push2=, $1, $3
+; NOSIMD-NEXT:    f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT:    f64.store 0($0), $pop3
+; NOSIMD-NEXT:    return
+  %fma = call <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %fma
+}
+
 define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; RELAXED-LABEL: fma_4xf32:
 ; RELAXED:         .functype fma_4xf32 (v128, v128, v128) -> (v128)
@@ -167,6 +1283,44 @@ define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; STRICT-NEXT:    call $push18=, fmaf, $pop17, $pop16, $pop15
 ; STRICT-NEXT:    f32x4.replace_lane $push19=, $pop14, 3, $pop18
 ; STRICT-NEXT:    return $pop19
+;
+; NOFP16-LABEL: fma_4xf32:
+; NOFP16:         .functype fma_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.extract_lane $push2=, $0, 0
+; NOFP16-NEXT:    f32x4.extract_lane $push1=, $1, 0
+; NOFP16-NEXT:    f32x4.extract_lane $push0=, $2, 0
+; NOFP16-NEXT:    call $push3=, fmaf, $pop2, $pop1, $pop0
+; NOFP16-NEXT:    f32x4.splat $push4=, $pop3
+; NOFP16-NEXT:    f32x4.extract_lane $push7=, $0, 1
+; NOFP16-NEXT:    f32x4.extract_lane $push6=, $1, 1
+; NOFP16-NEXT:    f32x4.extract_lane $push5=, $2, 1
+; NOFP16-NEXT:    call $push8=, fmaf, $pop7, $pop6, $pop5
+; NOFP16-NEXT:    f32x4.replace_lane $push9=, $pop4, 1, $pop8
+; NOFP16-NEXT:    f32x4.extract_lane $push12=, $0, 2
+; NOFP16-NEXT:    f32x4.extract_lane $push11=, $1, 2
+; NOFP16-NEXT:    f32x4.extract_lane $push10=, $2, 2
+; NOFP16-NEXT:    call $push13=, fmaf, $pop12, $pop11, $pop10
+; NOFP16-NEXT:    f32x4.replace_lane $push14=, $pop9, 2, $pop13
+; NOFP16-NEXT:    f32x4.extract_lane $push17=, $0, 3
+; NOFP16-NEXT:    f32x4.extract_lane $push16=, $1, 3
+; NOFP16-NEXT:    f32x4.extract_lane $push15=, $2, 3
+; NOFP16-NEXT:    call $push18=, fmaf, $pop17, $pop16, $pop15
+; NOFP16-NEXT:    f32x4.replace_lane $push19=, $pop14, 3, $pop18
+; NOFP16-NEXT:    return $pop19
+;
+; NOSIMD-LABEL: fma_4xf32:
+; NOSIMD:         .functype fma_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, fmaf, $4, $8, $12
+; NOSIMD-NEXT:    f32.store 12($0), $pop0
+; NOSIMD-NEXT:    call $push1=, fmaf, $3, $7, $11
+; NOSIMD-NEXT:    f32.store 8($0), $pop1
+; NOSIMD-NEXT:    call $push2=, fmaf, $2, $6, $10
+; NOSIMD-NEXT:    f32.store 4($0), $pop2
+; NOSIMD-NEXT:    call $push3=, fmaf, $1, $5, $9
+; NOSIMD-NEXT:    f32.store 0($0), $pop3
+; NOSIMD-NEXT:    return
   %fma = call <4 x float> @llvm.fma(<4 x float> %a, <4 x float> %b, <4 x float> %c)
   ret <4 x float> %fma
 }
@@ -176,9 +1330,9 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
 ; RELAXED-LABEL: fadd_fmul_contract_8xf32:
 ; RELAXED:         .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $6, $4, $2
+; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $4, $2, $6
 ; RELAXED-NEXT:    v128.store 16($0), $pop0
-; RELAXED-NEXT:    f32x4.relaxed_madd $push1=, $5, $3, $1
+; RELAXED-NEXT:    f32x4.relaxed_madd $push1=, $3, $1, $5
 ; RELAXED-NEXT:    v128.store 0($0), $pop1
 ; RELAXED-NEXT:    return
 ;
@@ -192,17 +1346,56 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
 ; STRICT-NEXT:    f32x4.add $push3=, $pop2, $5
 ; STRICT-NEXT:    v128.store 0($0), $pop3
 ; STRICT-NEXT:    return
+;
+; NOFP16-LABEL: fadd_fmul_contract_8xf32:
+; NOFP16:         .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.mul $push0=, $4, $2
+; NOFP16-NEXT:    f32x4.add $push1=, $pop0, $6
+; NOFP16-NEXT:    v128.store 16($0), $pop1
+; NOFP16-NEXT:    f32x4.mul $push2=, $3, $1
+; NOFP16-NEXT:    f32x4.add $push3=, $pop2, $5
+; NOFP16-NEXT:    v128.store 0($0), $pop3
+; NOFP16-NEXT:    return
+;
+; NOSIMD-LABEL: fadd_fmul_contract_8xf32:
+; NOSIMD:         .functype fadd_fmul_contract_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $16, $8
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $24
+; NOSIMD-NEXT:    f32.store 28($0), $pop1
+; NOSIMD-NEXT:    f32.mul $push2=, $15, $7
+; NOSIMD-NEXT:    f32.add $push3=, $pop2, $23
+; NOSIMD-NEXT:    f32.store 24($0), $pop3
+; NOSIMD-NEXT:    f32.mul $push4=, $14, $6
+; NOSIMD-NEXT:    f32.add $push5=, $pop4, $22
+; NOSIMD-NEXT:    f32.store 20($0), $pop5
+; NOSIMD-NEXT:    f32.mul $push6=, $13, $5
+; NOSIMD-NEXT:    f32.add $push7=, $pop6, $21
+; NOSIMD-NEXT:    f32.store 16($0), $pop7
+; NOSIMD-NEXT:    f32.mul $push8=, $12, $4
+; NOSIMD-NEXT:    f32.add $push9=, $pop8, $20
+; NOSIMD-NEXT:    f32.store 12($0), $pop9
+; NOSIMD-NEXT:    f32.mul $push10=, $11, $3
+; NOSIMD-NEXT:    f32.add $push11=, $pop10, $19
+; NOSIMD-NEXT:    f32.store 8($0), $pop11
+; NOSIMD-NEXT:    f32.mul $push12=, $10, $2
+; NOSIMD-NEXT:    f32.add $push13=, $pop12, $18
+; NOSIMD-NEXT:    f32.store 4($0), $pop13
+; NOSIMD-NEXT:    f32.mul $push14=, $9, $1
+; NOSIMD-NEXT:    f32.add $push15=, $pop14, $17
+; NOSIMD-NEXT:    f32.store 0($0), $pop15
+; NOSIMD-NEXT:    return
   %mul = fmul contract <8 x float> %b, %a
   %add = fadd contract <8 x float> %mul, %c
   ret <8 x float> %add
 }
 
-
 define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; RELAXED-LABEL: fadd_fmul_contract_2xf64:
 ; RELAXED:         .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f64x2.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT:    f64x2.relaxed_madd $push0=, $1, $0, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fadd_fmul_contract_2xf64:
@@ -211,28 +1404,64 @@ define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b,
 ; STRICT-NEXT:    f64x2.mul $push0=, $1, $0
 ; STRICT-NEXT:    f64x2.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_2xf64:
+; NOFP16:         .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64x2.mul $push0=, $1, $0
+; NOFP16-NEXT:    f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_2xf64:
+; NOSIMD:         .functype fadd_fmul_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $4, $2
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT:    f64.store 8($0), $pop1
+; NOSIMD-NEXT:    f64.mul $push2=, $3, $1
+; NOSIMD-NEXT:    f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT:    f64.store 0($0), $pop3
+; NOSIMD-NEXT:    return
   %mul = fmul contract <2 x double> %b, %a
   %add = fadd contract <2 x double> %mul, %c
   ret <2 x double> %add
 }
 
-define float @fadd_fmul_contract_f32(float %a, float %b, float %c) {
-; RELAXED-LABEL: fadd_fmul_contract_f32:
-; RELAXED:         .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+define <2 x double> @fadd_fmul_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fadd_fmul_2xf64:
+; RELAXED:         .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32.mul $push0=, $1, $0
-; RELAXED-NEXT:    f32.add $push1=, $pop0, $2
+; RELAXED-NEXT:    f64x2.mul $push0=, $1, $0
+; RELAXED-NEXT:    f64x2.add $push1=, $pop0, $2
 ; RELAXED-NEXT:    return $pop1
 ;
-; STRICT-LABEL: fadd_fmul_contract_f32:
-; STRICT:         .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-LABEL: fadd_fmul_2xf64:
+; STRICT:         .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
 ; STRICT-NEXT:  # %bb.0:
-; STRICT-NEXT:    f32.mul $push0=, $1, $0
-; STRICT-NEXT:    f32.add $push1=, $pop0, $2
+; STRICT-NEXT:    f64x2.mul $push0=, $1, $0
+; STRICT-NEXT:    f64x2.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
-  %mul = fmul contract float %b, %a
-  %add = fadd contract float %mul, %c
-  ret float %add
+;
+; NOFP16-LABEL: fadd_fmul_2xf64:
+; NOFP16:         .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64x2.mul $push0=, $1, $0
+; NOFP16-NEXT:    f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_2xf64:
+; NOSIMD:         .functype fadd_fmul_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $4, $2
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT:    f64.store 8($0), $pop1
+; NOSIMD-NEXT:    f64.mul $push2=, $3, $1
+; NOSIMD-NEXT:    f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT:    f64.store 0($0), $pop3
+; NOSIMD-NEXT:    return
+  %mul = fmul <2 x double> %b, %a
+  %add = fadd <2 x double> %mul, %c
+  ret <2 x double> %add
 }
 
 define float @fma_f32(float %a, float %b, float %c) {
@@ -247,6 +1476,18 @@ define float @fma_f32(float %a, float %b, float %c) {
 ; STRICT-NEXT:  # %bb.0:
 ; STRICT-NEXT:    call $push0=, fmaf, $0, $1, $2
 ; STRICT-NEXT:    return $pop0
+;
+; NOFP16-LABEL: fma_f32:
+; NOFP16:         .functype fma_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, fmaf, $0, $1, $2
+; NOFP16-NEXT:    return $pop0
+;
+; NOSIMD-LABEL: fma_f32:
+; NOSIMD:         .functype fma_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, fmaf, $0, $1, $2
+; NOSIMD-NEXT:    return $pop0
   %fma = call float @llvm.fma(float %a, float %b, float %c)
   ret float %fma
 }
@@ -263,6 +1504,18 @@ define double @fma_f64(double %a, double %b, double %c) {
 ; STRICT-NEXT:  # %bb.0:
 ; STRICT-NEXT:    call $push0=, fma, $0, $1, $2
 ; STRICT-NEXT:    return $pop0
+;
+; NOFP16-LABEL: fma_f64:
+; NOFP16:         .functype fma_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, fma, $0, $1, $2
+; NOFP16-NEXT:    return $pop0
+;
+; NOSIMD-LABEL: fma_f64:
+; NOSIMD:         .functype fma_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, fma, $0, $1, $2
+; NOSIMD-NEXT:    return $pop0
   %fma = call double @llvm.fma(double %a, double %b, double %c)
   ret double %fma
 }
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
index 6e2d860..b90c1da 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
@@ -27,7 +27,7 @@ define <4 x float> @fsub_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4
 ; RELAXED-LABEL: fsub_fmul_contract_4xf32:
 ; RELAXED:         .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32x4.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT:    f32x4.relaxed_nmadd $push0=, $1, $0, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fsub_fmul_contract_4xf32:
@@ -46,15 +46,14 @@ define <8 x half> @fsub_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x h
 ; RELAXED-LABEL: fsub_fmul_contract_8xf16:
 ; RELAXED:         .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f16x8.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT:    f16x8.nmadd $push0=, $1, $0, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fsub_fmul_contract_8xf16:
 ; STRICT:         .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
 ; STRICT-NEXT:  # %bb.0:
-; STRICT-NEXT:    f16x8.mul $push0=, $1, $0
-; STRICT-NEXT:    f16x8.sub $push1=, $2, $pop0
-; STRICT-NEXT:    return $pop1
+; STRICT-NEXT:    f16x8.nmadd $push0=, $1, $0, $2
+; STRICT-NEXT:    return $pop0
   %mul = fmul contract <8 x half> %b, %a
   %sub = fsub contract <8 x half> %c, %mul
   ret <8 x half> %sub
@@ -84,9 +83,9 @@ define <8 x float> @fsub_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
 ; RELAXED-LABEL: fsub_fmul_contract_8xf32:
 ; RELAXED:         .functype fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32x4.relaxed_nmadd $push0=, $6, $4, $2
+; RELAXED-NEXT:    f32x4.relaxed_nmadd $push0=, $4, $2, $6
 ; RELAXED-NEXT:    v128.store 16($0), $pop0
-; RELAXED-NEXT:    f32x4.relaxed_nmadd $push1=, $5, $3, $1
+; RELAXED-NEXT:    f32x4.relaxed_nmadd $push1=, $3, $1, $5
 ; RELAXED-NEXT:    v128.store 0($0), $pop1
 ; RELAXED-NEXT:    return
 ;
@@ -110,7 +109,7 @@ define <2 x double> @fsub_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b,
 ; RELAXED-LABEL: fsub_fmul_contract_2xf64:
 ; RELAXED:         .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f64x2.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT:    f64x2.relaxed_nmadd $push0=, $1, $0, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fsub_fmul_contract_2xf64:
@@ -143,3 +142,55 @@ define float @fsub_fmul_contract_f32(float %a, float %b, float %c) {
   ret float %sub
 }
 
+define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_8xf16:
+; RELAXED:         .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f16x8.nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_8xf16:
+; STRICT:         .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f16x8.nmadd $push0=, $0, $1, $2
+; STRICT-NEXT:    return $pop0
+  %fneg = fneg <8 x half> %a
+  %fma = call <8 x half> @llvm.fmuladd(<8 x half> %fneg, <8 x half> %b, <8 x half> %c)
+  ret <8 x half> %fma
+}
+
+define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fmuladd_4xf32:
+; RELAXED:         .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32x4.relaxed_nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_4xf32:
+; STRICT:         .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32x4.mul $push0=, $0, $1
+; STRICT-NEXT:    f32x4.sub $push1=, $2, $pop0
+; STRICT-NEXT:    return $pop1
+  %fneg = fneg <4 x float> %a
+  %fma = call <4 x float> @llvm.fmuladd(<4 x float> %fneg, <4 x float> %b, <4 x float> %c)
+  ret <4 x float> %fma
+}
+
+define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_2xf64:
+; RELAXED:         .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f64x2.relaxed_nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_2xf64:
+; STRICT:         .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f64x2.mul $push0=, $0, $1
+; STRICT-NEXT:    f64x2.sub $push1=, $2, $pop0
+; STRICT-NEXT:    return $pop1
+  %fneg = fneg <2 x double> %a
+  %fma = call <2 x double> @llvm.fmuladd(<2 x double> %fneg, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %fma
+}
diff --git a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
index a0c243b..f3950b7 100644
--- a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
+++ b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
@@ -1,16 +1,15 @@
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-;; A minimal test case. llc will crash if global variables already has a section
-;; prefix. Subsequent PRs will expand on this test case to test the hotness
-;; reconciliation implementation.
-
-; RUN: not llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \
+;; A minimal test case. Subsequent PRs will expand on this test case
+;; (e.g., with more functions, variables and profiles) and test the hotness
+;; reconcillation implementation.
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \
 ; RUN:     -partition-static-data-sections=true \
 ; RUN:     -data-sections=true  -unique-section-names=false \
-; RUN:     %s -o - 2>&1 | FileCheck %s --check-prefix=ERR
+; RUN:     %s -o - 2>&1 | FileCheck %s --check-prefix=IR
 
-; ERR: Global variable hot_bss already has a section prefix hot
+; IR: .section .bss.hot.,"aw"
 
 @hot_bss = internal global i32 0, !section_prefix !17
 
diff --git a/llvm/test/CodeGen/X86/global-variable-partition.ll b/llvm/test/CodeGen/X86/global-variable-partition.ll
index ce06d17..604b4fd 100644
--- a/llvm/test/CodeGen/X86/global-variable-partition.ll
+++ b/llvm/test/CodeGen/X86/global-variable-partition.ll
@@ -106,23 +106,31 @@ target triple = "x86_64-unknown-linux-gnu"
 ; UNIQ-NEXT:   .section	.data.unlikely.,"aw",@progbits,unique,8
 ; AGG-NEXT:    .section	.data.unlikely.,"aw",@progbits
 
+;; The `.section` directive is omitted for .data with -unique-section-names=false.
+; See MCSectionELF::shouldOmitSectionDirective for the implementation details.
+
 ; For @data_with_unknown_hotness
 ; SYM: 	       .type	.Ldata_with_unknown_hotness,@object          # @data_with_unknown_hotness
 ; SYM:         .section .data..Ldata_with_unknown_hotness,"aw",@progbits
 ; UNIQ:        .section  .data,"aw",@progbits,unique,9
-; The `.section` directive is omitted for .data with -unique-section-names=false.
-; See MCSectionELF::shouldOmitSectionDirective for the implementation details.
+
 ; AGG:         .data
 ; COMMON:      .Ldata_with_unknown_hotness:
 
-; For @hot_data_custom_bar_section
-; It has an explicit section attribute 'var' and shouldn't have hot or unlikely suffix.
+; For variables that are not eligible for section prefix annotation
 ; COMMON:      .type hot_data_custom_bar_section,@object
 ; SYM-NEXT:    .section bar,"aw",@progbits
 ; SYM:         hot_data_custom_bar_section
 ; UNIQ:        .section bar,"aw",@progbits
 ; AGG:         .section bar,"aw",@progbits
 
+; SYM:      .section .data.llvm.fake_var,"aw"
+; UNIQ:     .section .data,"aw"
+; AGG:      .data
+
+;; No section for linker declaration
+; COMMON-NOT:  qux
+
 @.str = private unnamed_addr constant [5 x i8] c"hot\09\00", align 1
 @.str.1 = private unnamed_addr constant [10 x i8] c"%d\09%d\09%d\0A\00", align 1
 @hot_relro_array = internal constant [2 x ptr] [ptr @bss2, ptr @data3]
@@ -137,6 +145,8 @@ target triple = "x86_64-unknown-linux-gnu"
 @data3 = internal global i32 3
 @data_with_unknown_hotness = private global i32 5
 @hot_data_custom_bar_section = internal global i32 101 #0
+@llvm.fake_var = internal global i32 123
+@qux = external global i64
 
 define void @cold_func(i32 %0) !prof !15 {
   %2 = load i32, ptr @cold_bss
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index 5aa266d..69abf6e 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -1447,3 +1447,158 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) {
   %r = icmp eq i512 %a, %b
   ret i1 %r
 }
+
+; Tests for any/allbits from memory.
+
+define i1 @anybits_i128_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i128_load_arg:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movq (%rdi), %rax
+; ANY-NEXT:    orq 8(%rdi), %rax
+; ANY-NEXT:    setne %al
+; ANY-NEXT:    retq
+  %ld = load i128, ptr %w
+  %cmp = icmp ne i128 %ld, 0
+  ret i1 %cmp
+}
+
+define i1 @allbits_i128_load_arg(ptr %w) {
+; SSE2-LABEL: allbits_i128_load_arg:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqb (%rdi), %xmm0
+; SSE2-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: allbits_i128_load_arg:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    ptest %xmm1, %xmm0
+; SSE41-NEXT:    setb %al
+; SSE41-NEXT:    retq
+;
+; AVXANY-LABEL: allbits_i128_load_arg:
+; AVXANY:       # %bb.0:
+; AVXANY-NEXT:    vmovdqa (%rdi), %xmm0
+; AVXANY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVXANY-NEXT:    vptest %xmm1, %xmm0
+; AVXANY-NEXT:    setb %al
+; AVXANY-NEXT:    retq
+  %ld = load i128, ptr %w
+  %cmp = icmp eq i128 %ld, -1
+  ret i1 %cmp
+}
+
+define i1 @anybits_i256_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i256_load_arg:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movq (%rdi), %rax
+; ANY-NEXT:    movq 8(%rdi), %rcx
+; ANY-NEXT:    orq 24(%rdi), %rcx
+; ANY-NEXT:    orq 16(%rdi), %rax
+; ANY-NEXT:    orq %rcx, %rax
+; ANY-NEXT:    setne %al
+; ANY-NEXT:    retq
+  %ld = load i256, ptr %w
+  %cmp = icmp ne i256 %ld, 0
+  ret i1 %cmp
+}
+
+define i1 @allbits_i256_load_arg(ptr %w) {
+; SSE-LABEL: allbits_i256_load_arg:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq (%rdi), %rax
+; SSE-NEXT:    movq 8(%rdi), %rcx
+; SSE-NEXT:    andq 24(%rdi), %rcx
+; SSE-NEXT:    andq 16(%rdi), %rax
+; SSE-NEXT:    andq %rcx, %rax
+; SSE-NEXT:    cmpq $-1, %rax
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: allbits_i256_load_arg:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vptest %ymm1, %ymm0
+; AVX1-NEXT:    setb %al
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: allbits_i256_load_arg:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vptest %ymm1, %ymm0
+; AVX2-NEXT:    setb %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: allbits_i256_load_arg:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT:    vptest %ymm1, %ymm0
+; AVX512-NEXT:    setb %al
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %ld = load i256, ptr %w
+  %cmp = icmp eq i256 %ld, -1
+  ret i1 %cmp
+}
+
+define i1 @anybits_i512_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i512_load_arg:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movq 16(%rdi), %rax
+; ANY-NEXT:    movq (%rdi), %rcx
+; ANY-NEXT:    movq 8(%rdi), %rdx
+; ANY-NEXT:    movq 24(%rdi), %rsi
+; ANY-NEXT:    orq 56(%rdi), %rsi
+; ANY-NEXT:    orq 40(%rdi), %rdx
+; ANY-NEXT:    orq %rsi, %rdx
+; ANY-NEXT:    orq 48(%rdi), %rax
+; ANY-NEXT:    orq 32(%rdi), %rcx
+; ANY-NEXT:    orq %rax, %rcx
+; ANY-NEXT:    orq %rdx, %rcx
+; ANY-NEXT:    setne %al
+; ANY-NEXT:    retq
+  %ld = load i512, ptr %w
+  %cmp = icmp ne i512 %ld, 0
+  ret i1 %cmp
+}
+
+define i1 @allbits_i512_load_arg(ptr %w) {
+; NO512-LABEL: allbits_i512_load_arg:
+; NO512:       # %bb.0:
+; NO512-NEXT:    movq 16(%rdi), %rax
+; NO512-NEXT:    movq (%rdi), %rcx
+; NO512-NEXT:    movq 8(%rdi), %rdx
+; NO512-NEXT:    movq 24(%rdi), %rsi
+; NO512-NEXT:    andq 56(%rdi), %rsi
+; NO512-NEXT:    andq 40(%rdi), %rdx
+; NO512-NEXT:    andq %rsi, %rdx
+; NO512-NEXT:    andq 48(%rdi), %rax
+; NO512-NEXT:    andq 32(%rdi), %rcx
+; NO512-NEXT:    andq %rax, %rcx
+; NO512-NEXT:    andq %rdx, %rcx
+; NO512-NEXT:    cmpq $-1, %rcx
+; NO512-NEXT:    sete %al
+; NO512-NEXT:    retq
+;
+; AVX512-LABEL: allbits_i512_load_arg:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = -1
+; AVX512-NEXT:    vpcmpneqd (%rdi), %zmm0, %k0
+; AVX512-NEXT:    kortestw %k0, %k0
+; AVX512-NEXT:    sete %al
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %ld = load i512, ptr %w
+  %cmp = icmp eq i512 %ld, -1
+  ret i1 %cmp
+}
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index 48aec4b..57da338 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -917,11 +917,11 @@ main:
     # CHECK: f16x8.nearest # encoding: [0xfd,0xb6,0x02]
     f16x8.nearest
 
-    # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xce,0x02]
-    f16x8.relaxed_madd
+    # CHECK: f16x8.madd # encoding: [0xfd,0xce,0x02]
+    f16x8.madd
 
-    # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xcf,0x02]
-    f16x8.relaxed_nmadd
+    # CHECK: f16x8.nmadd # encoding: [0xfd,0xcf,0x02]
+    f16x8.nmadd
 
     # CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc5,0x02]
     i16x8.trunc_sat_f16x8_s
diff --git a/llvm/test/TableGen/listsplat.td b/llvm/test/TableGen/listsplat.td
index 5a93a4c..43803d6 100644
--- a/llvm/test/TableGen/listsplat.td
+++ b/llvm/test/TableGen/listsplat.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// RUN: not llvm-tblgen -DERROR1 %s 2>&1 | FileCheck --check-prefix=ERROR1 %s
 
 // CHECK: ------------- Classes -----------------
 // CHECK-NEXT: class X<int X:a = ?, int X:b = ?> {
@@ -73,3 +74,8 @@ def DYa1 : Y<"a", 1>;
 def DYa2 : Y<"a", 2>;
 
 def DZ : X<42, !size([1, 2, 3])>;
+
+#ifdef ERROR1
+// ERROR1: !listsplat count -1 is negative
+defvar E = !listsplat("", -1);
+#endif
diff --git a/llvm/test/Transforms/PGOProfile/data-access-profile.ll b/llvm/test/Transforms/PGOProfile/data-access-profile.ll
index 29198f34..205184b 100644
--- a/llvm/test/Transforms/PGOProfile/data-access-profile.ll
+++ b/llvm/test/Transforms/PGOProfile/data-access-profile.ll
@@ -3,55 +3,72 @@
 
 ; RUN: rm -rf %t && split-file %s %t && cd %t
 
-;; Read a text profile and merge it into indexed profile.
+;; Read text profiles and merge them into indexed profiles.
 ; RUN: llvm-profdata merge --memprof-version=4 memprof.yaml -o memprof.profdata
+; RUN: llvm-profdata merge --memprof-version=4 memprof-no-dap.yaml -o memprof-no-dap.profdata
 
 ;; Run optimizer pass on an IR module without IR functions, and test that global
 ;; variables in the module could be annotated (i.e., no early return),
 ; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \
-; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT
+; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT
 
 ;; Run optimizer pass on the IR, and check the section prefix.
 ; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \
-; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT
+; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT
 
-;; Run optimizer pass without explicitly setting -memprof-annotate-static-data-prefix.
-;; The output text IR shouldn't have `section_prefix`
+;; Run memprof without providing memprof data. Test that IR has module flag
+;; `EnableDataAccessProf` as 0.
+; RUN: opt -passes='memprof-use<profile-filename=memprof-no-dap.profdata>' -memprof-annotate-static-data-prefix \
+; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefix=FLAG
+
+;; Run memprof without explicitly setting -memprof-annotate-static-data-prefix.
+;; The output text IR shouldn't have `section_prefix` or EnableDataAccessProf module flag.
 ; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' \
-; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --implicit-check-not="section_prefix"
+; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --check-prefix=FLAGLESS --implicit-check-not="section_prefix"
 
 ; LOG: Skip annotating string literal .str
 ; LOG: Global variable var1 is annotated as hot
 ; LOG: Global variable var2.llvm.125 is annotated as hot
 ; LOG: Global variable bar is not annotated
 ; LOG: Global variable foo is annotated as unlikely
-; LOG: Global variable var3 has explicit section name. Skip annotating.
-; LOG: Global variable var4 has explicit section name. Skip annotating.
+; LOG: Skip annotation for var3 due to explicit section name.
+; LOG: Skip annotation for var4 due to explicit section name.
+; LOG: Skip annotation for llvm.fake_var due to name starts with `llvm.`.
+; LOG: Skip annotation for qux due to linker declaration.
 
 ;; String literals are not annotated.
-; PREFIX: @.str = unnamed_addr constant [5 x i8] c"abcde"
-; PREFIX-NOT: section_prefix
-; PREFIX: @var1 = global i32 123, !section_prefix !0
+; IR: @.str = unnamed_addr constant [5 x i8] c"abcde"
+; IR-NOT: section_prefix
+; IR: @var1 = global i32 123, !section_prefix !0
 
 ;; @var.llvm.125 will be canonicalized to @var2 for profile look-up.
-; PREFIX-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0
+; IR-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0
 
 ;; @bar is not seen in hot symbol or known symbol set, so it won't get a section
 ;; prefix. Test this by testing that there is no section_prefix between @bar and
 ;; @foo.
-; PREFIX-NEXT: @bar = global i16 3
-; PREFIX-NOT: !section_prefix
+; IR-NEXT: @bar = global i16 3
+; IR-NOT: !section_prefix
 
 ;; @foo is unlikely.
-; PREFIX-NEXT: @foo = global i8 2, !section_prefix !1
+; IR-NEXT: @foo = global i8 2, !section_prefix !1
+
+; IR-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1"
+; IR-NEXT: @var4 = constant [1 x i64] [i64 98765] #0
+
+; IR: @llvm.fake_var = global i32 123
+; IR-NOT: !section_prefix
+; IR: @qux = external global i64
+; IR-NOT: !section_prefix
 
-; PREFIX-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1"
-; PREFIX-NEXT: @var4 = constant [1 x i64] [i64 98765] #0
+; IR: attributes #0 = { "rodata-section"="sec2" }
 
-; PREFIX: attributes #0 = { "rodata-section"="sec2" }
+; IR: !0 = !{!"section_prefix", !"hot"}
+; IR-NEXT: !1 = !{!"section_prefix", !"unlikely"}
+; IR-NEXT: !2 = !{i32 2, !"EnableDataAccessProf", i32 1}
 
-; PREFIX: !0 = !{!"section_prefix", !"hot"}
-; PREFIX-NEXT: !1 = !{!"section_prefix", !"unlikely"}
+; FLAG: !{i32 2, !"EnableDataAccessProf", i32 0}
+; FLAGLESS-NOT: EnableDataAccessProf
 
 ; STAT: 1 memprof - Number of global vars annotated with 'unlikely' section prefix.
 ; STAT: 2 memprof - Number of global vars with user-specified section (not annotated).
@@ -72,6 +89,24 @@ DataAccessProfiles:
     - foo
   KnownColdStrHashes: [ 999, 1001 ]
 ...
+;--- memprof-no-dap.yaml
+---
+# A memprof file with without data access profiles. The heap records are simplified
+# to pass profile parsing and don't need to match the IR.
+HeapProfileRecords:
+  - GUID:            0xdeadbeef12345678
+    AllocSites:
+      - Callstack:
+          - { Function: 0x1111111111111111, LineOffset: 11, Column: 10, IsInlineFrame: true }
+        MemInfoBlock:
+          AllocCount:      111
+          TotalSize:       222
+          TotalLifetime:   333
+          TotalLifetimeAccessDensity: 444
+    CallSites:
+      - Frames:
+        - { Function: 0x5555555555555555, LineOffset: 55, Column: 50, IsInlineFrame: true }
+...
 ;--- input.ll
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
@@ -84,11 +119,14 @@ target triple = "x86_64-unknown-linux-gnu"
 @foo = global i8 2
 @var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1"
 @var4 = constant [1 x i64][i64 98765] #0
+@llvm.fake_var = global i32 123
+@qux = external global i64
 
 define i32 @func() {
   %a = load i32, ptr @var1
   %b = load i32, ptr @var2.llvm.125
-  %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b)
+  %c = load i32, ptr @llvm.fake_var
+  %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b, i32 %c)
   ret i32 %ret
 }
 
@@ -108,5 +146,8 @@ target triple = "x86_64-unknown-linux-gnu"
 @foo = global i8 2
 @var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1"
 @var4 = constant [1 x i64][i64 98765] #0
+@llvm.fake_var = global i32 123
+@qux = external global i64
+
 
 attributes #0 = { "rodata-section"="sec2" }
diff --git a/llvm/unittests/ADT/BitFieldsTest.cpp b/llvm/unittests/ADT/BitFieldsTest.cpp
index 3062d5d..ae541fe 100644
--- a/llvm/unittests/ADT/BitFieldsTest.cpp
+++ b/llvm/unittests/ADT/BitFieldsTest.cpp
@@ -247,8 +247,8 @@ TEST(BitfieldsTest, ValueTooBigBounded) {
   Bitfield::set<A>(Storage, 0);
   Bitfield::set<A>(Storage, -1);
   Bitfield::set<A>(Storage, -2);
-  EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, 2), "value is too big");
-  EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, -3), "value is too small");
+  EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, 2), "value is out of range");
+  EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, -3), "value is out of range");
 }
 
 #endif
diff --git a/llvm/unittests/IR/ConstantFPRangeTest.cpp b/llvm/unittests/IR/ConstantFPRangeTest.cpp
index 2431db9..67fee96 100644
--- a/llvm/unittests/IR/ConstantFPRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantFPRangeTest.cpp
@@ -1066,6 +1066,115 @@ TEST_F(ConstantFPRangeTest, sub) {
 #endif
 }
 
+TEST_F(ConstantFPRangeTest, mul) {
+  EXPECT_EQ(Full.mul(Full), NonNaN.unionWith(QNaN));
+  EXPECT_EQ(Full.mul(Empty), Empty);
+  EXPECT_EQ(Empty.mul(Full), Empty);
+  EXPECT_EQ(Empty.mul(Empty), Empty);
+  EXPECT_EQ(One.mul(One), ConstantFPRange(APFloat(1.0)));
+  EXPECT_EQ(Some.mul(Some),
+            ConstantFPRange::getNonNaN(APFloat(-9.0), APFloat(9.0)));
+  EXPECT_EQ(SomePos.mul(SomeNeg),
+            ConstantFPRange::getNonNaN(APFloat(-9.0), APFloat(-0.0)));
+  EXPECT_EQ(PosInf.mul(PosInf), PosInf);
+  EXPECT_EQ(NegInf.mul(NegInf), PosInf);
+  EXPECT_EQ(PosInf.mul(Finite), NonNaN.unionWith(QNaN));
+  EXPECT_EQ(NegInf.mul(Finite), NonNaN.unionWith(QNaN));
+  EXPECT_EQ(PosInf.mul(NegInf), NegInf);
+  EXPECT_EQ(NegInf.mul(PosInf), NegInf);
+  EXPECT_EQ(PosZero.mul(NegZero), NegZero);
+  EXPECT_EQ(PosZero.mul(Zero), Zero);
+  EXPECT_EQ(NegZero.mul(NegZero), PosZero);
+  EXPECT_EQ(NegZero.mul(Zero), Zero);
+  EXPECT_EQ(NaN.mul(NaN), QNaN);
+  EXPECT_EQ(NaN.mul(Finite), QNaN);
+
+#if defined(EXPENSIVE_CHECKS)
+  EnumerateTwoInterestingConstantFPRanges(
+      [](const ConstantFPRange &LHS, const ConstantFPRange &RHS) {
+        ConstantFPRange Res = LHS.mul(RHS);
+        ConstantFPRange Expected =
+            ConstantFPRange::getEmpty(LHS.getSemantics());
+        EnumerateValuesInConstantFPRange(
+            LHS,
+            [&](const APFloat &LHSC) {
+              EnumerateValuesInConstantFPRange(
+                  RHS,
+                  [&](const APFloat &RHSC) {
+                    APFloat Prod = LHSC * RHSC;
+                    EXPECT_TRUE(Res.contains(Prod))
+                        << "Wrong result for " << LHS << " * " << RHS
+                        << ". The result " << Res << " should contain " << Prod;
+                    if (!Expected.contains(Prod))
+                      Expected = Expected.unionWith(ConstantFPRange(Prod));
+                  },
+                  /*IgnoreNaNPayload=*/true);
+            },
+            /*IgnoreNaNPayload=*/true);
+        EXPECT_EQ(Res, Expected)
+            << "Suboptimal result for " << LHS << " * " << RHS << ". Expected "
+            << Expected << ", but got " << Res;
+      },
+      SparseLevel::SpecialValuesOnly);
+#endif
+}
+
+TEST_F(ConstantFPRangeTest, div) {
+  EXPECT_EQ(Full.div(Full), NonNaN.unionWith(QNaN));
+  EXPECT_EQ(Full.div(Empty), Empty);
+  EXPECT_EQ(Empty.div(Full), Empty);
+  EXPECT_EQ(Empty.div(Empty), Empty);
+  EXPECT_EQ(One.div(One), ConstantFPRange(APFloat(1.0)));
+  EXPECT_EQ(Some.div(Some), NonNaN.unionWith(QNaN));
+  EXPECT_EQ(SomePos.div(SomeNeg),
+            ConstantFPRange(APFloat::getInf(Sem, /*Negative=*/true),
+                            APFloat::getZero(Sem, /*Negative=*/true),
+                            /*MayBeQNaN=*/true, /*MayBeSNaN=*/false));
+  EXPECT_EQ(PosInf.div(PosInf), QNaN);
+  EXPECT_EQ(NegInf.div(NegInf), QNaN);
+  EXPECT_EQ(PosInf.div(Finite), NonNaN);
+  EXPECT_EQ(NegInf.div(Finite), NonNaN);
+  EXPECT_EQ(PosInf.div(NegInf), QNaN);
+  EXPECT_EQ(NegInf.div(PosInf), QNaN);
+  EXPECT_EQ(Zero.div(Zero), QNaN);
+  EXPECT_EQ(SomePos.div(PosInf), PosZero);
+  EXPECT_EQ(SomeNeg.div(PosInf), NegZero);
+  EXPECT_EQ(PosInf.div(SomePos), PosInf);
+  EXPECT_EQ(NegInf.div(SomeNeg), PosInf);
+  EXPECT_EQ(NegInf.div(Some), NonNaN);
+  EXPECT_EQ(NaN.div(NaN), QNaN);
+  EXPECT_EQ(NaN.div(Finite), QNaN);
+
+#if defined(EXPENSIVE_CHECKS)
+  EnumerateTwoInterestingConstantFPRanges(
+      [](const ConstantFPRange &LHS, const ConstantFPRange &RHS) {
+        ConstantFPRange Res = LHS.div(RHS);
+        ConstantFPRange Expected =
+            ConstantFPRange::getEmpty(LHS.getSemantics());
+        EnumerateValuesInConstantFPRange(
+            LHS,
+            [&](const APFloat &LHSC) {
+              EnumerateValuesInConstantFPRange(
+                  RHS,
+                  [&](const APFloat &RHSC) {
+                    APFloat Val = LHSC / RHSC;
+                    EXPECT_TRUE(Res.contains(Val))
+                        << "Wrong result for " << LHS << " / " << RHS
+                        << ". The result " << Res << " should contain " << Val;
+                    if (!Expected.contains(Val))
+                      Expected = Expected.unionWith(ConstantFPRange(Val));
+                  },
+                  /*IgnoreNaNPayload=*/true);
+            },
+            /*IgnoreNaNPayload=*/true);
+        EXPECT_EQ(Res, Expected)
+            << "Suboptimal result for " << LHS << " / " << RHS << ". Expected "
+            << Expected << ", but got " << Res;
+      },
+      SparseLevel::SpecialValuesOnly);
+#endif
+}
+
 TEST_F(ConstantFPRangeTest, flushDenormals) {
   const fltSemantics &FP8Sem = APFloat::Float8E4M3();
   APFloat NormalVal = APFloat::getSmallestNormalized(FP8Sem);
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index bdcb8a3..343c2bb71 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -1129,6 +1129,7 @@ Transforms/LowerIFunc/ifunc-alias.ll
 Transforms/LowerIFunc/ifunc-nonsense-resolvers.ll
 Transforms/LowerIFunc/ifunc-program-addrspace.ll
 Transforms/LowerIFunc/lower-ifunc.ll
+Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
 Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
 Transforms/LowerMatrixIntrinsics/multiply-fused.ll
 Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
@@ -1311,82 +1312,6 @@ Transforms/SimpleLoopUnswitch/pr60736.ll
 Transforms/SimpleLoopUnswitch/trivial-unswitch-freeze-individual-conditions.ll
 Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
 Transforms/SimpleLoopUnswitch/trivial-unswitch-logical-and-or.ll
-Transforms/SLPVectorizer/AArch64/gather-root.ll
-Transforms/SLPVectorizer/AArch64/horizontal.ll
-Transforms/SLPVectorizer/AArch64/loadi8.ll
-Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
-Transforms/SLPVectorizer/AArch64/uselistorder.ll
-Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
-Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
-Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
-Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
-Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll
-Transforms/SLPVectorizer/call-arg-reduced-by-minbitwidth.ll
-Transforms/SLPVectorizer/const-bool-logical-or-reduction.ll
-Transforms/SLPVectorizer/extracts-with-undefs.ll
-Transforms/SLPVectorizer/freeze-signedness-missed.ll
-Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
-Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll
-Transforms/SLPVectorizer/insert-element-build-vector-const.ll
-Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
-Transforms/SLPVectorizer/insert-element-build-vector.ll
-Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll
-Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
-Transforms/SLPVectorizer/minbitwidth-user-not-min.ll
-Transforms/SLPVectorizer/partial-register-extract.ll
-Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll
-Transforms/SLPVectorizer/reorder-node.ll
-Transforms/SLPVectorizer/reused-buildvector-matching-vectorized-node.ll
-Transforms/SLPVectorizer/revec.ll
-Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll
-Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
-Transforms/SLPVectorizer/RISCV/reordered-interleaved-loads.ll
-Transforms/SLPVectorizer/RISCV/revec.ll
-Transforms/SLPVectorizer/RISCV/select-profitability.ll
-Transforms/SLPVectorizer/RISCV/shuffled-gather-casted.ll
-Transforms/SLPVectorizer/RISCV/unsigned-node-trunc-with-signed-users.ll
-Transforms/SLPVectorizer/slp-deleted-inst.ll
-Transforms/SLPVectorizer/SystemZ/cmp-ptr-minmax.ll
-Transforms/SLPVectorizer/SystemZ/ext-not-resized-op-resized.ll
-Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll
-Transforms/SLPVectorizer/X86/bool-mask.ll
-Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
-Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll
-Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll
-Transforms/SLPVectorizer/X86/cmp_sel.ll
-Transforms/SLPVectorizer/X86/crash_7zip.ll
-Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
-Transforms/SLPVectorizer/X86/crash_cmpop.ll
-Transforms/SLPVectorizer/X86/debug-counter.ll
-Transforms/SLPVectorizer/X86/debug-info-salvage.ll
-Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
-Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
-Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll
-Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
-Transforms/SLPVectorizer/X86/horizontal-minmax.ll
-Transforms/SLPVectorizer/X86/insert-after-bundle.ll
-Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
-Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
-Transforms/SLPVectorizer/X86/minbw-user-non-sizable.ll
-Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
-Transforms/SLPVectorizer/X86/ordering-bug.ll
-Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll
-Transforms/SLPVectorizer/X86/phi-node-reshuffled-part.ll
-Transforms/SLPVectorizer/X86/pr46983.ll
-Transforms/SLPVectorizer/X86/pr49933.ll
-Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
-Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll
-Transforms/SLPVectorizer/X86/reduction-logical.ll
-Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll
-Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
-Transforms/SLPVectorizer/X86/select-reduction-op.ll
-Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
-Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
-Transforms/SLPVectorizer/X86/undef_vect.ll
-Transforms/SLPVectorizer/X86/used-reduced-op.ll
-Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
-Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
-Transforms/SLPVectorizer/X86/whole-registers-compare.ll
 Transforms/SROA/addrspacecast.ll
 Transforms/SROA/phi-and-select.ll
 Transforms/SROA/phi-gep.ll