Diffstat (limited to 'llvm')
122 files changed, 4401 insertions, 1441 deletions
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index c84e2cc..22ecf4d 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -948,6 +948,15 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL)) # Enable -Wstring-conversion to catch misuse of string literals. if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") append("-Wstring-conversion" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + + # Add -Wno-pass-failed to disable the -Wpass-failed warning, which reports + # failure to perform optimizations suggested by pragmas. This warning is not + # relevant for LLVM projects and may be injected by pragmas in libstdc++. + # FIXME: Reconsider this choice if warnings from STL headers can be reliably + # avoided (https://github.com/llvm/llvm-project/issues/157666). + # This option has been available since Clang 3.5, and the Clang version we + # require is newer than that. + append("-Wno-pass-failed" CMAKE_CXX_FLAGS) endif() if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") @@ -962,13 +971,6 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL)) # Enable -Wctad-maybe-unsupported to catch unintended use of CTAD. add_flag_if_supported("-Wctad-maybe-unsupported" CTAD_MAYBE_UNSPPORTED_FLAG) - - # Disable -Wno-pass-failed flag, which reports failure to perform - # optimizations suggested by pragmas. This warning is not relevant for LLVM - # projects and may be injected by pragmas in libstdc++. - # FIXME: Reconsider this choice if warnings from STL headers can be reliably - # avoided (https://github.com/llvm/llvm-project/issues/157666). - add_flag_if_supported("-Wno-pass-failed" NO_PASS_FAILED_FLAG) endif (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL)) if (LLVM_COMPILER_IS_GCC_COMPATIBLE AND NOT LLVM_ENABLE_WARNINGS) diff --git a/llvm/docs/CallGraphSection.md b/llvm/docs/CallGraphSection.md new file mode 100644 index 0000000..8b18727 --- /dev/null +++ b/llvm/docs/CallGraphSection.md @@ -0,0 +1,18 @@ +# .callgraph Section Layout + +The `.callgraph` section is used to store call graph information for each function. The section contains a series of records, with each record corresponding to a single function. + +## Per-Function Record Layout + +Each record in the `.callgraph` section has the following binary layout: + +| Field | Type | Size (bits) | Description | | -------------------------------------- | ------------- | ----------- | ------------------------------------------------------------------------------------------------------- | | Format Version | `uint8_t` | 8 | The version of the record format. The current version is 0. | | Flags | `uint8_t` | 8 | A bitfield where: Bit 0 is set if the function is a potential indirect call target; Bit 1 is set if there are direct callees; Bit 2 is set if there are indirect callees. The remaining 5 bits are reserved. | | Function Entry PC | `uintptr_t` | 32/64 | The address of the function's entry point. | | Function Type ID | `uint64_t` | 64 | The type ID of the function. This field is non-zero if the function is a potential indirect call target and its type is known. | | Number of Unique Direct Callees | `ULEB128` | Variable | The number of unique direct call destinations from this function. This field is only present if there is at least one direct callee. | | Direct Callees Array | `uintptr_t[]` | Variable | An array of unique direct callee entry point addresses. This field is only present if there is at least one direct callee.
| +| Number of Unique Indirect Target Type IDs | `ULEB128` | Variable | The number of unique indirect call target type IDs. This field is only present if there is at least one indirect target type ID. | +| Indirect Target Type IDs Array | `uint64_t[]` | Variable | An array of unique indirect call target type IDs. This field is only present if there is at least one indirect target type ID. | diff --git a/llvm/docs/CodeGenerator.rst b/llvm/docs/CodeGenerator.rst index 7486054..fc704a3 100644 --- a/llvm/docs/CodeGenerator.rst +++ b/llvm/docs/CodeGenerator.rst @@ -1662,6 +1662,13 @@ and stack sizes (unsigned LEB128). The stack size values only include the space allocated in the function prologue. Functions with dynamic stack allocations are not included. +Emitting function call graph information +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A section containing metadata on the function call graph is emitted when +``TargetOptions::EmitCallGraphSection`` is set (``--call-graph-section``). The +layout of this section is documented in detail in :doc:`CallGraphSection`. + VLIW Packetizer --------------- diff --git a/llvm/docs/QualGroup.rst b/llvm/docs/QualGroup.rst index b45f569..5c05e4e 100644 --- a/llvm/docs/QualGroup.rst +++ b/llvm/docs/QualGroup.rst @@ -75,6 +75,16 @@ They meet the criteria for inclusion below. Knowing their handles help us keep t - capitan-davide - capitan_davide - capitan-davide + * - Jorge Pinto Sousa + - Critical Techworks + - sousajo-cc + - sousajo-cc + - sousajo-cc + * - José Rui Simões + - Critical Software + - jr-simoes + - jr_simoes + - iznogoud-zz * - Oscar Slotosch - Validas - slotosch @@ -100,6 +110,11 @@ They meet the criteria for inclusion below. Knowing their handles help us keep t - YoungJunLee - YoungJunLee - IamYJLee + * - Zaky Hermawan + - No Affiliation + - ZakyHermawan + - quarkz99 + - zakyHermawan Organizations are limited to three representatives within the group to maintain diversity. diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst index 5d842d3..9b1bf1b 100644 --- a/llvm/docs/Reference.rst +++ b/llvm/docs/Reference.rst @@ -15,6 +15,7 @@ LLVM and API reference documentation. BranchWeightMetadata Bugpoint CalleeTypeMetadata + CallGraphSection CIBestPractices CommandGuide/index ContentAddressableStorage diff --git a/llvm/include/llvm/ADT/ImmutableSet.h b/llvm/include/llvm/ADT/ImmutableSet.h index 310539f..017585a4 100644 --- a/llvm/include/llvm/ADT/ImmutableSet.h +++ b/llvm/include/llvm/ADT/ImmutableSet.h @@ -21,9 +21,7 @@ #include "llvm/ADT/iterator.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Signals.h" #include <cassert> #include <cstdint> #include <functional> diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h index 815e85d..5039a3f 100644 --- a/llvm/include/llvm/BinaryFormat/Dwarf.h +++ b/llvm/include/llvm/BinaryFormat/Dwarf.h @@ -390,7 +390,7 @@ toDW_LNAME(SourceLanguage language) { case DW_LANG_C11: return {{DW_LNAME_C, 201112}}; case DW_LANG_C17: - return {{DW_LNAME_C, 201712}}; + return {{DW_LNAME_C, 201710}}; case DW_LANG_C_plus_plus: return {{DW_LNAME_C_plus_plus, 0}}; case DW_LANG_C_plus_plus_03: diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index 4c744a2..19ca444 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -198,26 +198,13 @@ private: /// and targets. 
using CGTypeId = uint64_t; - /// Map type identifiers to callsite labels. Labels are generated for each - /// indirect callsite in the function. - SmallVector<std::pair<CGTypeId, MCSymbol *>> CallSiteLabels; + /// Unique indirect callee type IDs. + SmallSet<CGTypeId, 4> IndirectCalleeTypeIDs; + /// Unique direct callees. SmallSet<MCSymbol *, 4> DirectCallees; }; - /// Enumeration of function kinds, and their mapping to function kind values - /// stored in callgraph section entries. - enum class FunctionKind : uint64_t { - /// Function cannot be target to indirect calls. - NOT_INDIRECT_TARGET = 0, - - /// Function may be target to indirect calls but its type id is unknown. - INDIRECT_TARGET_UNKNOWN_TID = 1, - - /// Function may be target to indirect calls and its type id is known. - INDIRECT_TARGET_KNOWN_TID = 2, - }; - - enum CallGraphSectionFormatVersion : uint64_t { + enum CallGraphSectionFormatVersion : uint8_t { V_0 = 0, }; @@ -386,9 +373,9 @@ public: /// are available. Returns empty string otherwise. StringRef getConstantSectionSuffix(const Constant *C) const; - /// Iff MI is an indirect call, generate and emit a label after the callsites - /// which will be used to populate the .callgraph section. For direct - /// callsites add the callee symbol to direct callsites list of FuncCGInfo. + /// If MI is an indirect call, add its expected callee type IDs to the + /// indirect callee type ID set of FuncCGInfo. If MI is a direct call, add + /// the callee symbol to its direct callees set. void handleCallsiteForCallgraph( FunctionCallGraphInfo &FuncCGInfo, const MachineFunction::CallSiteInfoMap &CallSitesInfoMap, diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 88691b9..73f2c55 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -847,8 +847,7 @@ public: /// This is usually true on most targets. But some targets, like Thumb1, /// have immediate shift instructions, but no immediate "and" instruction; /// this makes the fold unprofitable. - virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, - CombineLevel Level) const { + virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N) const { return true; } diff --git a/llvm/include/llvm/IR/ConstantFPRange.h b/llvm/include/llvm/IR/ConstantFPRange.h index 4a54caa..d47f6c0 100644 --- a/llvm/include/llvm/IR/ConstantFPRange.h +++ b/llvm/include/llvm/IR/ConstantFPRange.h @@ -206,6 +206,22 @@ public: /// Calculate range of negated values. LLVM_ABI ConstantFPRange negate() const; + + /// Get the range without NaNs. It is useful when we apply the nnan flag to + /// the range of operands/results. + ConstantFPRange getWithoutNaN() const { + return ConstantFPRange(Lower, Upper, false, false); + } + + /// Get the range without infinities. It is useful when we apply the ninf + /// flag to the range of operands/results. + LLVM_ABI ConstantFPRange getWithoutInf() const; + + /// Return a new range in the specified format with the specified rounding + /// mode. 
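+ /// If an endpoint cannot be converted, the full range of the destination + /// format is returned. A conversion never yields a signaling NaN: if the + /// source range may contain any NaN, the result allows only quiet NaNs.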
+ LLVM_ABI ConstantFPRange + cast(const fltSemantics &DstSem, + APFloat::roundingMode RM = APFloat::rmNearestTiesToEven) const; }; inline raw_ostream &operator<<(raw_ostream &OS, const ConstantFPRange &CR) { diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index de7a237..27930bb 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -1715,7 +1715,7 @@ public: static SelectInst *Create(Value *C, Value *S1, Value *S2, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr, - Instruction *MDFrom = nullptr) { + const Instruction *MDFrom = nullptr) { SelectInst *Sel = new (AllocMarker) SelectInst(C, S1, S2, NameStr, InsertBefore); if (MDFrom) diff --git a/llvm/include/llvm/Support/raw_ostream.h b/llvm/include/llvm/Support/raw_ostream.h index f87344e..70916d8 100644 --- a/llvm/include/llvm/Support/raw_ostream.h +++ b/llvm/include/llvm/Support/raw_ostream.h @@ -739,7 +739,7 @@ class LLVM_ABI raw_null_ostream : public raw_pwrite_stream { uint64_t current_pos() const override; public: - explicit raw_null_ostream() = default; + explicit raw_null_ostream() : raw_pwrite_stream(/*Unbuffered=*/true) {} ~raw_null_ostream() override; }; diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 774063b..632be7a 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -119,11 +119,12 @@ def SDTIntShiftOp : SDTypeProfile<1, 2, [ // shl, sra, srl SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>, SDTCisSameNumEltsAs<0, 2> ]>; def SDTIntShiftPairOp : SDTypeProfile<2, 3, [ // shl_parts, sra_parts, srl_parts - SDTCisInt<0>, SDTCisSameAs<1, 0>, - SDTCisSameAs<2, 0>, SDTCisSameAs<3, 0>, SDTCisInt<4> + SDTCisInt<0>, SDTCisSameAs<1, 0>, SDTCisSameAs<2, 0>, SDTCisSameAs<3, 0>, + SDTCisInt<4>, SDTCisSameNumEltsAs<0, 4> ]>; def SDTIntShiftDOp: SDTypeProfile<1, 3, [ // fshl, fshr - SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>, + SDTCisSameNumEltsAs<0, 3> ]>; def SDTIntSatNoShOp : SDTypeProfile<1, 2, [ // ssat with no shift SDTCisSameAs<0, 1>, SDTCisInt<2> @@ -139,7 +140,7 @@ def SDTFPBinOp : SDTypeProfile<1, 2, [ // fadd, fmul, etc. SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0> ]>; def SDTFPSignOp : SDTypeProfile<1, 2, [ // fcopysign. - SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisFP<2> + SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisFP<2>, SDTCisSameNumEltsAs<0, 2> ]>; def SDTFPTernaryOp : SDTypeProfile<1, 3, [ // fmadd, fnmsub, etc. 
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0> @@ -148,7 +149,7 @@ def SDTIntUnaryOp : SDTypeProfile<1, 1, [ // bitreverse SDTCisSameAs<0, 1>, SDTCisInt<0> ]>; def SDTIntBitCountUnaryOp : SDTypeProfile<1, 1, [ // ctlz, cttz - SDTCisInt<0>, SDTCisInt<1> + SDTCisInt<0>, SDTCisInt<1>, SDTCisSameNumEltsAs<0, 1> ]>; def SDTIntExtendOp : SDTypeProfile<1, 1, [ // sext, zext, anyext SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisSameNumEltsAs<0, 1> @@ -300,10 +301,10 @@ def SDTVecInsert : SDTypeProfile<1, 3, [ // vector insert SDTCisEltOfVec<2, 1>, SDTCisSameAs<0, 1>, SDTCisPtrTy<3> ]>; def SDTVecReduce : SDTypeProfile<1, 1, [ // vector reduction - SDTCisInt<0>, SDTCisVec<1> + SDTCisInt<0>, SDTCisVec<1>, SDTCisInt<1> ]>; def SDTFPVecReduce : SDTypeProfile<1, 1, [ // FP vector reduction - SDTCisFP<0>, SDTCisVec<1> + SDTCisFP<0>, SDTCisEltOfVec<0, 1> ]>; def SDTVecReverse : SDTypeProfile<1, 1, [ // vector reverse diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index ed2e01c..dc8cd86d 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -277,6 +277,7 @@ public: MuslF32, MuslSF, MuslX32, + MuslWALI, LLVM, MSVC, @@ -798,6 +799,12 @@ public: return getObjectFormat() == Triple::DXContainer; } + /// Tests whether the target uses WALI Wasm + bool isWALI() const { + return getArch() == Triple::wasm32 && isOSLinux() && + getEnvironment() == Triple::MuslWALI; + } + /// Tests whether the target is the PS4 platform. bool isPS4() const { return getArch() == Triple::x86_64 && @@ -840,6 +847,7 @@ public: getEnvironment() == Triple::MuslF32 || getEnvironment() == Triple::MuslSF || getEnvironment() == Triple::MuslX32 || + getEnvironment() == Triple::MuslWALI || getEnvironment() == Triple::OpenHOS || isOSLiteOS(); } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 0ca55a26..54e916e 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -118,6 +118,10 @@ static cl::opt<bool> #endif cl::desc("")); +static cl::opt<bool> PreserveBitcodeUseListOrder( + "preserve-bc-uselistorder", cl::Hidden, cl::init(true), + cl::desc("Preserve use-list order when writing LLVM bitcode.")); + namespace llvm { extern FunctionSummary::ForceSummaryHotnessType ForceSummaryEdgesCold; } @@ -217,7 +221,10 @@ public: bool ShouldPreserveUseListOrder, const ModuleSummaryIndex *Index) : BitcodeWriterBase(Stream, StrtabBuilder), M(M), - VE(M, ShouldPreserveUseListOrder), Index(Index) { + VE(M, PreserveBitcodeUseListOrder.getNumOccurrences() + ? PreserveBitcodeUseListOrder + : ShouldPreserveUseListOrder), + Index(Index) { // Assign ValueIds to any callee values in the index that came from // indirect call profiles and were recorded as a GUID not a Value* // (which would have been assigned an ID by the ValueEnumerator). 
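Given the record layout documented in CallGraphSection.md above, and the emission code in AsmPrinter.cpp that follows, it may help to see how a consumer would walk one record. The following is a minimal sketch, not an LLVM API: it assumes a 64-bit target (so `uintptr_t` fields are 8 bytes) and host-endian section bytes, and the names `readULEB128`, `CGRecord`, and `parseRecord` are illustrative helpers.

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Decode a ULEB128 value, advancing P past the encoded bytes.
static uint64_t readULEB128(const uint8_t *&P) {
  uint64_t Value = 0;
  unsigned Shift = 0;
  uint8_t Byte;
  do {
    Byte = *P++;
    Value |= uint64_t(Byte & 0x7f) << Shift;
    Shift += 7;
  } while (Byte & 0x80);
  return Value;
}

struct CGRecord {
  uint8_t Version;                       // currently 0
  uint8_t Flags;                         // bit 0: indirect target,
                                         // bit 1: has direct callees,
                                         // bit 2: has indirect callees
  uint64_t EntryPC;                      // function entry point
  uint64_t TypeID;                       // 0 if unknown or not a target
  std::vector<uint64_t> DirectCallees;   // unique callee entry PCs
  std::vector<uint64_t> IndirectTypeIDs; // unique callee type IDs
};

// Parse one per-function record at P, advancing P to the next record.
static CGRecord parseRecord(const uint8_t *&P) {
  CGRecord R;
  auto Read64 = [&P] { uint64_t V; std::memcpy(&V, P, 8); P += 8; return V; };
  R.Version = *P++;
  R.Flags = *P++;
  R.EntryPC = Read64();
  R.TypeID = Read64();
  if (R.Flags & (1u << 1)) // direct callee count and array present
    for (uint64_t N = readULEB128(P); N--;)
      R.DirectCallees.push_back(Read64());
  if (R.Flags & (1u << 2)) // indirect target type ID count and array present
    for (uint64_t N = readULEB128(P); N--;)
      R.IndirectTypeIDs.push_back(Read64());
  return R;
}
```

The two count fields and their arrays appear only when the corresponding flag bit is set, mirroring the optionality described in the layout table.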
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 10df9c1..219bbc9 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -20,6 +20,7 @@ #include "WinException.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -205,6 +206,17 @@ public: }; } // namespace +namespace callgraph { +LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); +enum Flags : uint8_t { + None = 0, + IsIndirectTarget = 1u << 0, + HasDirectCallees = 1u << 1, + HasIndirectCallees = 1u << 2, + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue*/ HasIndirectCallees) +}; +} // namespace callgraph + class llvm::AddrLabelMap { MCContext &Context; struct AddrLabelSymEntry { @@ -1683,63 +1695,65 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF, OutStreamer->pushSection(); OutStreamer->switchSection(FuncCGSection); - // Emit format version number. - OutStreamer->emitInt64(CallGraphSectionFormatVersion::V_0); - - // Emit function's self information, which is composed of: - // 1) FunctionEntryPc - // 2) FunctionKind: Whether the function is indirect target, and if so, - // whether its type id is known. - // 3) FunctionTypeId: Emit only when the function is an indirect target - // and its type id is known. - - // Emit function entry pc. const MCSymbol *FunctionSymbol = getFunctionBegin(); - OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize()); - + const Function &F = MF.getFunction(); // If this function has external linkage or has its address taken and // it is not a callback, then anything could call it. - const Function &F = MF.getFunction(); bool IsIndirectTarget = !F.hasLocalLinkage() || F.hasAddressTaken(nullptr, /*IgnoreCallbackUses=*/true, /*IgnoreAssumeLikeCalls=*/true, /*IgnoreLLVMUsed=*/false); - // FIXME: FunctionKind takes a few values but emitted as a 64-bit value. - // Can be optimized to occupy 2 bits instead. - // Emit function kind, and type id if available. - if (!IsIndirectTarget) { - OutStreamer->emitInt64( - static_cast<uint64_t>(FunctionKind::NOT_INDIRECT_TARGET)); - } else { - if (const auto *TypeId = extractNumericCGTypeId(F)) { - OutStreamer->emitInt64( - static_cast<uint64_t>(FunctionKind::INDIRECT_TARGET_KNOWN_TID)); - OutStreamer->emitInt64(TypeId->getZExtValue()); - } else { - OutStreamer->emitInt64( - static_cast<uint64_t>(FunctionKind::INDIRECT_TARGET_UNKNOWN_TID)); - } - } + const auto &DirectCallees = FuncCGInfo.DirectCallees; + const auto &IndirectCalleeTypeIDs = FuncCGInfo.IndirectCalleeTypeIDs; + + using namespace callgraph; + Flags CGFlags = Flags::None; + if (IsIndirectTarget) + CGFlags |= Flags::IsIndirectTarget; + if (DirectCallees.size() > 0) + CGFlags |= Flags::HasDirectCallees; + if (IndirectCalleeTypeIDs.size() > 0) + CGFlags |= Flags::HasIndirectCallees; + + // Emit function's call graph information. + // 1) CallGraphSectionFormatVersion + // 2) Flags + // a. Bit 0 is set if the function is a potential indirect call + // target. + // b. Bit 1 is set if there are direct callees. + // c. Bit 2 is set if there are indirect callees. + // d. The remaining 5 bits in Flags are reserved for future use. + // 3) Function entry PC. + // 4) FunctionTypeID if the function is an indirect target and its type + // ID is known, otherwise it is set to 0. + // 5) Number of unique direct callees, if at least one exists. 
+ // 6) For each unique direct callee, the callee's PC. + // 7) Number of unique indirect target type IDs, if at least one exists. + // 8) Each unique indirect target type id. + OutStreamer->emitInt8(CallGraphSectionFormatVersion::V_0); + OutStreamer->emitInt8(static_cast<uint8_t>(CGFlags)); + OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize()); + const auto *TypeId = extractNumericCGTypeId(F); + if (IsIndirectTarget && TypeId) + OutStreamer->emitInt64(TypeId->getZExtValue()); + else + OutStreamer->emitInt64(0); - // Emit callsite labels, where each element is a pair of type id and - // indirect callsite pc. - const auto &CallSiteLabels = FuncCGInfo.CallSiteLabels; - OutStreamer->emitInt64(CallSiteLabels.size()); - for (const auto &[TypeId, Label] : CallSiteLabels) { - OutStreamer->emitInt64(TypeId); - OutStreamer->emitSymbolValue(Label, TM.getProgramPointerSize()); + if (DirectCallees.size() > 0) { + OutStreamer->emitULEB128IntValue(DirectCallees.size()); + for (const auto &CalleeSymbol : DirectCallees) + OutStreamer->emitSymbolValue(CalleeSymbol, TM.getProgramPointerSize()); + FuncCGInfo.DirectCallees.clear(); } - FuncCGInfo.CallSiteLabels.clear(); - - const auto &DirectCallees = FuncCGInfo.DirectCallees; - OutStreamer->emitInt64(DirectCallees.size()); - for (const auto &CalleeSymbol : DirectCallees) { - OutStreamer->emitSymbolValue(CalleeSymbol, TM.getProgramPointerSize()); + if (IndirectCalleeTypeIDs.size() > 0) { + OutStreamer->emitULEB128IntValue(IndirectCalleeTypeIDs.size()); + for (const auto &CalleeTypeId : IndirectCalleeTypeIDs) + OutStreamer->emitInt64(CalleeTypeId); + FuncCGInfo.IndirectCalleeTypeIDs.clear(); } - FuncCGInfo.DirectCallees.clear(); - + // End of emitting call graph section contents. OutStreamer->popSection(); } @@ -1877,8 +1891,7 @@ void AsmPrinter::handleCallsiteForCallgraph( FunctionCallGraphInfo &FuncCGInfo, const MachineFunction::CallSiteInfoMap &CallSitesInfoMap, const MachineInstr &MI) { - assert(MI.isCall() && - "Callsite labels are meant for call instructions only."); + assert(MI.isCall() && "This method is meant for call instructions only."); const MachineOperand &CalleeOperand = MI.getOperand(0); if (CalleeOperand.isGlobal() || CalleeOperand.isSymbol()) { // Handle direct calls. @@ -1903,10 +1916,8 @@ void AsmPrinter::handleCallsiteForCallgraph( // Handle indirect callsite info. // Only indirect calls have type identifiers set. 
for (ConstantInt *CalleeTypeId : CallSiteInfo->second.CalleeTypeIds) { - MCSymbol *S = MF->getContext().createTempSymbol(); - OutStreamer->emitLabel(S); uint64_t CalleeTypeIdVal = CalleeTypeId->getZExtValue(); - FuncCGInfo.CallSiteLabels.emplace_back(CalleeTypeIdVal, S); + FuncCGInfo.IndirectCalleeTypeIDs.insert(CalleeTypeIdVal); } } diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 12d749c..e57ed24 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -569,40 +569,30 @@ void CodeViewDebug::emitCodeViewMagicVersion() { OS.emitInt32(COFF::DEBUG_SECTION_MAGIC); } -static SourceLanguage MapDWLangToCVLang(unsigned DWLang) { - switch (DWLang) { - case dwarf::DW_LANG_C: - case dwarf::DW_LANG_C89: - case dwarf::DW_LANG_C99: - case dwarf::DW_LANG_C11: +static SourceLanguage +MapDWARFLanguageToCVLang(dwarf::SourceLanguageName DWLName) { + switch (DWLName) { + case dwarf::DW_LNAME_C: return SourceLanguage::C; - case dwarf::DW_LANG_C_plus_plus: - case dwarf::DW_LANG_C_plus_plus_03: - case dwarf::DW_LANG_C_plus_plus_11: - case dwarf::DW_LANG_C_plus_plus_14: + case dwarf::DW_LNAME_C_plus_plus: return SourceLanguage::Cpp; - case dwarf::DW_LANG_Fortran77: - case dwarf::DW_LANG_Fortran90: - case dwarf::DW_LANG_Fortran95: - case dwarf::DW_LANG_Fortran03: - case dwarf::DW_LANG_Fortran08: + case dwarf::DW_LNAME_Fortran: return SourceLanguage::Fortran; - case dwarf::DW_LANG_Pascal83: + case dwarf::DW_LNAME_Pascal: return SourceLanguage::Pascal; - case dwarf::DW_LANG_Cobol74: - case dwarf::DW_LANG_Cobol85: + case dwarf::DW_LNAME_Cobol: return SourceLanguage::Cobol; - case dwarf::DW_LANG_Java: + case dwarf::DW_LNAME_Java: return SourceLanguage::Java; - case dwarf::DW_LANG_D: + case dwarf::DW_LNAME_D: return SourceLanguage::D; - case dwarf::DW_LANG_Swift: + case dwarf::DW_LNAME_Swift: return SourceLanguage::Swift; - case dwarf::DW_LANG_Rust: + case dwarf::DW_LNAME_Rust: return SourceLanguage::Rust; - case dwarf::DW_LANG_ObjC: + case dwarf::DW_LNAME_ObjC: return SourceLanguage::ObjC; - case dwarf::DW_LANG_ObjC_plus_plus: + case dwarf::DW_LNAME_ObjC_plus_plus: return SourceLanguage::ObjCpp; default: // There's no CodeView representation for this language, and CV doesn't @@ -612,6 +602,14 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) { } } +static SourceLanguage MapDWARFLanguageToCVLang(dwarf::SourceLanguage DWLang) { + auto MaybeLName = dwarf::toDW_LNAME(DWLang); + if (!MaybeLName) + return MapDWARFLanguageToCVLang(static_cast<dwarf::SourceLanguageName>(0)); + + return MapDWARFLanguageToCVLang(MaybeLName->first); +} + void CodeViewDebug::beginModule(Module *M) { // If COFF debug section is not available, skip any debug info related stuff. if (!Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) { @@ -633,8 +631,13 @@ void CodeViewDebug::beginModule(Module *M) { Node = *CUs->operands().begin(); } const auto *CU = cast<DICompileUnit>(Node); + DISourceLanguageName Lang = CU->getSourceLanguage(); CurrentSourceLanguage = - MapDWLangToCVLang(CU->getSourceLanguage().getUnversionedName()); + Lang.hasVersionedName() + ? 
MapDWARFLanguageToCVLang( + static_cast<dwarf::SourceLanguageName>(Lang.getName())) + : MapDWARFLanguageToCVLang( + static_cast<dwarf::SourceLanguage>(Lang.getName())); if (!M->getCodeViewFlag() || CU->getEmissionKind() == DICompileUnit::NoDebug) { Asm = nullptr; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index d751a7f..433877f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1039,8 +1039,12 @@ void DwarfDebug::finishUnitAttributes(const DICompileUnit *DIUnit, } else NewCU.addString(Die, dwarf::DW_AT_producer, Producer); - NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, - DIUnit->getSourceLanguage().getUnversionedName()); + if (auto Lang = DIUnit->getSourceLanguage(); Lang.hasVersionedName()) + NewCU.addUInt(Die, dwarf::DW_AT_language_name, dwarf::DW_FORM_data2, + Lang.getName()); + else + NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, + Lang.getName()); NewCU.addString(Die, dwarf::DW_AT_name, FN); StringRef SysRoot = DIUnit->getSysRoot(); diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index d5153b7..cdcb29d9 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -1209,7 +1209,7 @@ MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB, MIE = MBB.instr_begin(); MII != MIE; --MII) { const MachineInstr &MI = *std::prev(MII); - if (MI.isDebugInstr() || MI.isPseudoProbe()) + if (MI.isDebugOrPseudoInstr()) continue; RegisterOperands RegOpers; RegOpers.collect(MI, *TRI, *MRI, false, false); diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp index 5f37890..7d4674b 100644 --- a/llvm/lib/CodeGen/RegisterPressure.cpp +++ b/llvm/lib/CodeGen/RegisterPressure.cpp @@ -858,7 +858,7 @@ void RegPressureTracker::recedeSkipDebugValues() { void RegPressureTracker::recede(SmallVectorImpl<VRegMaskOrUnit> *LiveUses) { recedeSkipDebugValues(); - if (CurrPos->isDebugInstr() || CurrPos->isPseudoProbe()) { + if (CurrPos->isDebugOrPseudoInstr()) { // It's possible to only have debug_value and pseudo probe instructions and // hit the start of the block. assert(CurrPos == MBB->begin()); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5ffdc4e..b23b190 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10628,7 +10628,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // folding this will increase the total number of instructions. 
if (N0.getOpcode() == ISD::SRL && (N0.getOperand(1) == N1 || N0.hasOneUse()) && - TLI.shouldFoldConstantShiftPairToMask(N, Level)) { + TLI.shouldFoldConstantShiftPairToMask(N)) { if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, /*AllowUndefs*/ false, /*AllowTypeMismatch*/ true)) { @@ -11207,7 +11207,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2), MASK) or // (and (srl x, (sub c2, c1), MASK) if ((N0.getOperand(1) == N1 || N0->hasOneUse()) && - TLI.shouldFoldConstantShiftPairToMask(N, Level)) { + TLI.shouldFoldConstantShiftPairToMask(N)) { auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { const APInt &LHSC = LHS->getAPIntValue(); @@ -18870,27 +18870,38 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) { static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const TargetLowering &TLI) { - // We only do this if the target has legal ftrunc. Otherwise, we'd likely be - // replacing casts with a libcall. We also must be allowed to ignore -0.0 - // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer - // conversions would return +0.0. + // We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc. + // If NoSignedZerosFPMath is enabled, this is a direct replacement. + // Otherwise, for strict math, we must handle an edge case: for unsigned + // conversions, use FABS to handle negative inputs. Take -0.0 as an + // example: it first becomes integer 0 and is converted back to +0.0, but + // FTRUNC on its own could produce -0.0. + // FIXME: We should be able to use node-level FMF here. - // TODO: If strict math, should we use FABS (+ range check for signed cast)? EVT VT = N->getValueType(0); - if (!TLI.isOperationLegal(ISD::FTRUNC, VT) || - !DAG.getTarget().Options.NoSignedZerosFPMath) + if (!TLI.isOperationLegal(ISD::FTRUNC, VT)) return SDValue(); // fptosi/fptoui round towards zero, so converting from FP to integer and // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X SDValue N0 = N->getOperand(0); if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT && - N0.getOperand(0).getValueType() == VT) - return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); + N0.getOperand(0).getValueType() == VT) { + if (DAG.getTarget().Options.NoSignedZerosFPMath) + return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); + } if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT && - N0.getOperand(0).getValueType() == VT) - return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); + N0.getOperand(0).getValueType() == VT) { + if (DAG.getTarget().Options.NoSignedZerosFPMath) + return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); + + // Strict math: use FABS to handle negative inputs correctly. 
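+ // For example, with X = -0.25 the fptoui/uitofp round trip yields +0.0 + // (the truncated value is 0), while FTRUNC(-0.25) alone would yield -0.0; + // FTRUNC(FABS(X)) restores +0.0. For X <= -1.0, fptoui is poison, so any + // result is acceptable there.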
+ if (TLI.isFAbsFree(VT)) { + SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0)); + return DAG.getNode(ISD::FTRUNC, DL, VT, Abs); + } + } return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index b5f8a61..437d0f4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -3313,7 +3313,6 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FP_ROUND: R = SoftPromoteHalfRes_FP_ROUND(N); break; // Unary FP Operations - case ISD::FABS: case ISD::FACOS: case ISD::FASIN: case ISD::FATAN: @@ -3329,7 +3328,6 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FLOG2: case ISD::FLOG10: case ISD::FNEARBYINT: - case ISD::FNEG: case ISD::FREEZE: case ISD::FRINT: case ISD::FROUND: @@ -3341,6 +3339,12 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FTAN: case ISD::FTANH: case ISD::FCANONICALIZE: R = SoftPromoteHalfRes_UnaryOp(N); break; + case ISD::FABS: + R = SoftPromoteHalfRes_FABS(N); + break; + case ISD::FNEG: + R = SoftPromoteHalfRes_FNEG(N); + break; case ISD::AssertNoFPClass: R = SoftPromoteHalfRes_AssertNoFPClass(N); break; @@ -3670,6 +3674,24 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UnaryOp(SDNode *N) { return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res); } +SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FABS(SDNode *N) { + SDValue Op = GetSoftPromotedHalf(N->getOperand(0)); + SDLoc dl(N); + + // Clear the sign bit. + return DAG.getNode(ISD::AND, dl, MVT::i16, Op, + DAG.getConstant(0x7fff, dl, MVT::i16)); +} + +SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FNEG(SDNode *N) { + SDValue Op = GetSoftPromotedHalf(N->getOperand(0)); + SDLoc dl(N); + + // Invert the sign bit. + return DAG.getNode(ISD::XOR, dl, MVT::i16, Op, + DAG.getConstant(0x8000, dl, MVT::i16)); +} + SDValue DAGTypeLegalizer::SoftPromoteHalfRes_AssertNoFPClass(SDNode *N) { return GetSoftPromotedHalf(N->getOperand(0)); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d580ce0..603dc34 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -832,6 +832,8 @@ private: SDValue SoftPromoteHalfRes_SELECT(SDNode *N); SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N); SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N); + SDValue SoftPromoteHalfRes_FABS(SDNode *N); + SDValue SoftPromoteHalfRes_FNEG(SDNode *N); SDValue SoftPromoteHalfRes_AssertNoFPClass(SDNode *N); SDValue SoftPromoteHalfRes_XINT_TO_FP(SDNode *N); SDValue SoftPromoteHalfRes_UNDEF(SDNode *N); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 0bc877d..2430d98 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -102,6 +102,10 @@ static cl::opt<bool> PrintProfData( "print-prof-data", cl::Hidden, cl::desc("Pretty print perf data (branch weights, etc) when dumping")); +static cl::opt<bool> PreserveAssemblyUseListOrder( + "preserve-ll-uselistorder", cl::Hidden, cl::init(false), + cl::desc("Preserve use-list order when writing LLVM assembly.")); + // Make virtual table appear in this compilation unit. 
AssemblyAnnotationWriter::~AssemblyAnnotationWriter() = default; @@ -2939,7 +2943,10 @@ AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac, bool IsForDebug, bool ShouldPreserveUseListOrder) : Out(o), TheModule(M), Machine(Mac), TypePrinter(M), AnnotationWriter(AAW), IsForDebug(IsForDebug), - ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) { + ShouldPreserveUseListOrder( + PreserveAssemblyUseListOrder.getNumOccurrences() + ? PreserveAssemblyUseListOrder + : ShouldPreserveUseListOrder) { if (!TheModule) return; for (const GlobalObject &GO : TheModule->global_objects()) @@ -2950,7 +2957,8 @@ AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac, const ModuleSummaryIndex *Index, bool IsForDebug) : Out(o), TheIndex(Index), Machine(Mac), TypePrinter(/*Module=*/nullptr), - IsForDebug(IsForDebug), ShouldPreserveUseListOrder(false) {} + IsForDebug(IsForDebug), + ShouldPreserveUseListOrder(PreserveAssemblyUseListOrder) {} void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) { if (!Operand) { diff --git a/llvm/lib/IR/ConstantFPRange.cpp b/llvm/lib/IR/ConstantFPRange.cpp index fba6942..070e833 100644 --- a/llvm/lib/IR/ConstantFPRange.cpp +++ b/llvm/lib/IR/ConstantFPRange.cpp @@ -326,6 +326,8 @@ std::optional<bool> ConstantFPRange::getSignBit() const { } bool ConstantFPRange::operator==(const ConstantFPRange &CR) const { + assert(&getSemantics() == &CR.getSemantics() && + "Should only use the same semantics"); if (MayBeSNaN != CR.MayBeSNaN || MayBeQNaN != CR.MayBeQNaN) return false; return Lower.bitwiseIsEqual(CR.Lower) && Upper.bitwiseIsEqual(CR.Upper); @@ -411,3 +413,34 @@ ConstantFPRange ConstantFPRange::abs() const { ConstantFPRange ConstantFPRange::negate() const { return ConstantFPRange(-Upper, -Lower, MayBeQNaN, MayBeSNaN); } + +ConstantFPRange ConstantFPRange::getWithoutInf() const { + if (isNaNOnly()) + return *this; + APFloat NewLower = Lower; + APFloat NewUpper = Upper; + if (Lower.isNegInfinity()) + NewLower = APFloat::getLargest(getSemantics(), /*Negative=*/true); + if (Upper.isPosInfinity()) + NewUpper = APFloat::getLargest(getSemantics(), /*Negative=*/false); + canonicalizeRange(NewLower, NewUpper); + return ConstantFPRange(std::move(NewLower), std::move(NewUpper), MayBeQNaN, + MayBeSNaN); +} + +ConstantFPRange ConstantFPRange::cast(const fltSemantics &DstSem, + APFloat::roundingMode RM) const { + bool LosesInfo; + APFloat NewLower = Lower; + APFloat NewUpper = Upper; + // To be conservative, return the full range if the conversion is invalid. + if (NewLower.convert(DstSem, RM, &LosesInfo) == APFloat::opInvalidOp || + NewLower.isNaN()) + return getFull(DstSem); + if (NewUpper.convert(DstSem, RM, &LosesInfo) == APFloat::opInvalidOp || + NewUpper.isNaN()) + return getFull(DstSem); + return ConstantFPRange(std::move(NewLower), std::move(NewUpper), + /*MayBeQNaNVal=*/MayBeQNaN || MayBeSNaN, + /*MayBeSNaNVal=*/false); +} diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp index c5a2ec2..7a8c8ad 100644 --- a/llvm/lib/Object/XCOFFObjectFile.cpp +++ b/llvm/lib/Object/XCOFFObjectFile.cpp @@ -379,7 +379,7 @@ Expected<StringRef> XCOFFObjectFile::getSectionName(DataRefImpl Sec) const { } uint64_t XCOFFObjectFile::getSectionAddress(DataRefImpl Sec) const { - // Avoid ternary due to failure to convert the ubig32_t value to a unit64_t + // Avoid ternary due to failure to convert the ubig32_t value to a uint64_t // with MSVC. 
if (is64Bit()) return toSection64(Sec)->VirtualAddress; @@ -397,7 +397,7 @@ uint64_t XCOFFObjectFile::getSectionIndex(DataRefImpl Sec) const { } uint64_t XCOFFObjectFile::getSectionSize(DataRefImpl Sec) const { - // Avoid ternary due to failure to convert the ubig32_t value to a unit64_t + // Avoid ternary due to failure to convert the ubig32_t value to a uint64_t // with MSVC. if (is64Bit()) return toSection64(Sec)->SectionSize; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7294f3e..fbce3b0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18640,7 +18640,7 @@ bool AArch64TargetLowering::isDesirableToCommuteXorWithShift( } bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask( - const SDNode *N, CombineLevel Level) const { + const SDNode *N) const { assert(((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index e472e7d..00956fd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -300,8 +300,7 @@ public: bool isDesirableToCommuteXorWithShift(const SDNode *N) const override; /// Return true if it is profitable to fold a pair of shifts into a mask. - bool shouldFoldConstantShiftPairToMask(const SDNode *N, - CombineLevel Level) const override; + bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override; /// Return true if it is profitable to fold a pair of shifts into a mask. bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 7392f4b..bfe2c80 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -633,6 +633,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}}) .Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SplitTo32SExtInReg}}); + addRulesForGOpcs({G_ASSERT_ZEXT, G_ASSERT_SEXT}, Standard) + .Uni(S32, {{Sgpr32}, {Sgpr32, Imm}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Imm}}) + .Uni(S64, {{Sgpr64}, {Sgpr64, Imm}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Imm}}); + bool hasSMRDx3 = ST->hasScalarDwordx3Loads(); bool hasSMRDSmall = ST->hasScalarSubwordLoads(); bool usesTrue16 = ST->useRealTrue16Insts(); diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 4d3331a..c684f9e 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -674,15 +674,9 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true); MachineOperand HiDstOp = Op0HOp1H->getOperand(0); - if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { - Op0LOp1L->setFlag(MachineInstr::MIFlag::NoFPExcept); - Op0HOp1H->setFlag(MachineInstr::MIFlag::NoFPExcept); - } - if (I.getFlag(MachineInstr::MIFlag::FmContract)) { - Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract); - Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract); - } - + uint32_t IFlags = I.getFlags(); + Op0LOp1L->setFlags(IFlags); + Op0HOp1H->setFlags(IFlags); LoDstOp.setIsRenamable(DstOp.isRenamable()); HiDstOp.setIsRenamable(DstOp.isRenamable()); diff --git 
a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2a40fb9..67ea2dd 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -42,7 +42,6 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/ComplexDeinterleavingPass.h" #include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -13817,7 +13816,7 @@ bool ARMTargetLowering::isDesirableToCommuteXorWithShift( } bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( - const SDNode *N, CombineLevel Level) const { + const SDNode *N) const { assert(((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && @@ -13827,7 +13826,8 @@ bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( if (!Subtarget->isThumb1Only()) return true; - if (Level == BeforeLegalizeTypes) + EVT VT = N->getValueType(0); + if (VT.getScalarSizeInBits() > 32) return true; return false; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 26ff54c..70aa001 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -772,8 +772,7 @@ class VectorType; bool isDesirableToCommuteXorWithShift(const SDNode *N) const override; - bool shouldFoldConstantShiftPairToMask(const SDNode *N, - CombineLevel Level) const override; + bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override; /// Return true if it is profitable to fold a pair of shifts into a mask. bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override { diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index b05de49..7f1ff45 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -1306,7 +1306,7 @@ bool MipsTargetLowering::hasBitTest(SDValue X, SDValue Y) const { } bool MipsTargetLowering::shouldFoldConstantShiftPairToMask( - const SDNode *N, CombineLevel Level) const { + const SDNode *N) const { assert(((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index c65c76c..25a0bf9 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -290,8 +290,7 @@ class TargetRegisterClass; bool isCheapToSpeculateCttz(Type *Ty) const override; bool isCheapToSpeculateCtlz(Type *Ty) const override; bool hasBitTest(SDValue X, SDValue Y) const override; - bool shouldFoldConstantShiftPairToMask(const SDNode *N, - CombineLevel Level) const override; + bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override; /// Return the register type for a given MVT, ensuring vectors are treated /// as a series of gpr sized integers. diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index f692180..944a1e2 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -585,6 +585,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // We cannot sextinreg(i1). Expand to shifts. 
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + // Custom handling for PowerPC ucmp instruction + setOperationAction(ISD::UCMP, MVT::i32, Custom); + setOperationAction(ISD::UCMP, MVT::i64, isPPC64 ? Custom : Expand); + // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to // support continuation, user-level threading, and etc.. As a result, no @@ -12618,6 +12622,33 @@ SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues({Sub, OverflowTrunc}, dl); } +// Lower unsigned 3-way compare producing -1/0/1. +SDValue PPCTargetLowering::LowerUCMP(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue A = DAG.getFreeze(Op.getOperand(0)); + SDValue B = DAG.getFreeze(Op.getOperand(1)); + EVT OpVT = A.getValueType(); // operand type + EVT ResVT = Op.getValueType(); // result type + + // First compute diff = A - B (will become subf). + SDValue Diff = DAG.getNode(ISD::SUB, DL, OpVT, A, B); + + // Generate B - A using SUBC to capture carry. + SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + SDValue SubC = DAG.getNode(PPCISD::SUBC, DL, VTs, B, A); + SDValue CA0 = SubC.getValue(1); + + // t2 = A - B + CA0 using SUBE. + SDValue SubE1 = DAG.getNode(PPCISD::SUBE, DL, VTs, A, B, CA0); + SDValue CA1 = SubE1.getValue(1); + + // res = diff - t2 + CA1 using SUBE (produces desired -1/0/1). + SDValue ResPair = DAG.getNode(PPCISD::SUBE, DL, VTs, Diff, SubE1, CA1); + + // Extract the first result and truncate to result type if needed + return DAG.getSExtOrTrunc(ResPair.getValue(0), DL, ResVT); +} + /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -12722,6 +12753,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UADDO_CARRY: case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG); + case ISD::UCMP: + return LowerUCMP(Op, DAG); } } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 6694305..59f3387 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1318,6 +1318,7 @@ namespace llvm { SDValue LowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADDSUBO(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUCMP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerToLibCall(const char *LibCallName, SDValue Op, SelectionDAG &DAG) const; SDValue lowerLibCallBasedOnType(const char *LibCallFloatName, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 7a14929..b9e01c3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1653,17 +1653,18 @@ def riscv_selectcc_frag : PatFrag<(ops node:$lhs, node:$rhs, node:$cc, node:$falsev), [{}], IntCCtoRISCVCC>; -multiclass SelectCC_GPR_rrirr<DAGOperand valty, ValueType vt> { +multiclass SelectCC_GPR_rrirr<DAGOperand valty, ValueType vt, + ValueType cmpvt = XLenVT> { let usesCustomInserter = 1 in def _Using_CC_GPR : Pseudo<(outs valty:$dst), (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, valty:$truev, valty:$falsev), [(set valty:$dst, - (riscv_selectcc_frag:$cc (XLenVT GPR:$lhs), GPR:$rhs, cond, + (riscv_selectcc_frag:$cc (cmpvt GPR:$lhs), GPR:$rhs, cond, (vt valty:$truev), valty:$falsev))]>; // 
Explicitly select 0 in the condition to X0. The register coalescer doesn't // always do it. - def : Pat<(riscv_selectcc_frag:$cc (XLenVT GPR:$lhs), 0, cond, (vt valty:$truev), + def : Pat<(riscv_selectcc_frag:$cc (cmpvt GPR:$lhs), 0, cond, (vt valty:$truev), valty:$falsev), (!cast<Instruction>(NAME#"_Using_CC_GPR") GPR:$lhs, (XLenVT X0), (IntCCtoRISCVCC $cc), valty:$truev, valty:$falsev)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index b9510ef..65e7e3b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -59,9 +59,9 @@ def FPR64IN32X : RegisterOperand<GPRPair> { def DExt : ExtInfo<"", "", [HasStdExtD], f64, FPR64, FPR32, FPR64, ?>; def ZdinxExt : ExtInfo<"_INX", "Zfinx", [HasStdExtZdinx, IsRV64], - f64, FPR64INX, FPR32INX, FPR64INX, ?>; + f64, FPR64INX, FPR32INX, FPR64INX, ?, i64>; def Zdinx32Ext : ExtInfo<"_IN32X", "ZdinxRV32Only", [HasStdExtZdinx, IsRV32], - f64, FPR64IN32X, FPR32INX, FPR64IN32X, ?>; + f64, FPR64IN32X, FPR32INX, FPR64IN32X, ?, i32>; defvar DExts = [DExt, ZdinxExt, Zdinx32Ext]; defvar DExtsRV64 = [DExt, ZdinxExt]; @@ -261,8 +261,10 @@ let Predicates = [HasStdExtZdinx, IsRV32] in { /// Float conversion operations // f64 -> f32, f32 -> f64 -def : Pat<(any_fpround FPR64IN32X:$rs1), (FCVT_S_D_IN32X FPR64IN32X:$rs1, FRM_DYN)>; -def : Pat<(any_fpextend FPR32INX:$rs1), (FCVT_D_S_IN32X FPR32INX:$rs1, FRM_RNE)>; +def : Pat<(any_fpround FPR64IN32X:$rs1), + (FCVT_S_D_IN32X FPR64IN32X:$rs1, (i32 FRM_DYN))>; +def : Pat<(any_fpextend FPR32INX:$rs1), + (FCVT_D_S_IN32X FPR32INX:$rs1, (i32 FRM_RNE))>; } // Predicates = [HasStdExtZdinx, IsRV32] // [u]int<->double conversion patterns must be gated on IsRV32 or IsRV64, so @@ -321,7 +323,7 @@ def : Pat<(any_fsqrt FPR64INX:$rs1), (FSQRT_D_INX FPR64INX:$rs1, FRM_DYN)>; def : Pat<(fneg FPR64INX:$rs1), (FSGNJN_D_INX $rs1, $rs1)>; def : Pat<(fabs FPR64INX:$rs1), (FSGNJX_D_INX $rs1, $rs1)>; -def : Pat<(riscv_fclass FPR64INX:$rs1), (FCLASS_D_INX $rs1)>; +def : Pat<(i64 (riscv_fclass FPR64INX:$rs1)), (FCLASS_D_INX $rs1)>; def : PatFprFpr<fcopysign, FSGNJ_D_INX, FPR64INX, f64>; def : PatFprFpr<riscv_fsgnjx, FSGNJX_D_INX, FPR64INX, f64>; @@ -354,41 +356,46 @@ def : Pat<(fneg (any_fma_nsz FPR64INX:$rs1, FPR64INX:$rs2, FPR64INX:$rs3)), } // Predicates = [HasStdExtZdinx, IsRV64] let Predicates = [HasStdExtZdinx, IsRV32] in { -def : Pat<(any_fsqrt FPR64IN32X:$rs1), (FSQRT_D_IN32X FPR64IN32X:$rs1, FRM_DYN)>; +def : Pat<(any_fsqrt FPR64IN32X:$rs1), + (FSQRT_D_IN32X FPR64IN32X:$rs1, (i32 FRM_DYN))>; def : Pat<(fneg FPR64IN32X:$rs1), (FSGNJN_D_IN32X $rs1, $rs1)>; def : Pat<(fabs FPR64IN32X:$rs1), (FSGNJX_D_IN32X $rs1, $rs1)>; -def : Pat<(riscv_fclass FPR64IN32X:$rs1), (FCLASS_D_IN32X $rs1)>; +def : Pat<(i32 (riscv_fclass FPR64IN32X:$rs1)), (FCLASS_D_IN32X $rs1)>; def : PatFprFpr<fcopysign, FSGNJ_D_IN32X, FPR64IN32X, f64>; def : PatFprFpr<riscv_fsgnjx, FSGNJX_D_IN32X, FPR64IN32X, f64>; def : Pat<(fcopysign FPR64IN32X:$rs1, (fneg FPR64IN32X:$rs2)), (FSGNJN_D_IN32X FPR64IN32X:$rs1, FPR64IN32X:$rs2)>; def : Pat<(fcopysign FPR64IN32X:$rs1, FPR32INX:$rs2), - (FSGNJ_D_IN32X $rs1, (FCVT_D_S_IN32X $rs2, FRM_RNE))>; + (FSGNJ_D_IN32X $rs1, (FCVT_D_S_IN32X $rs2, (i32 FRM_RNE)))>; def : Pat<(fcopysign FPR32INX:$rs1, FPR64IN32X:$rs2), - (FSGNJ_S_INX $rs1, (FCVT_S_D_IN32X $rs2, FRM_DYN))>; + (FSGNJ_S_INX $rs1, (FCVT_S_D_IN32X $rs2, (i32 FRM_DYN)))>; // fmadd: rs1 * rs2 + rs3 def : Pat<(any_fma FPR64IN32X:$rs1, FPR64IN32X:$rs2, FPR64IN32X:$rs3), - (FMADD_D_IN32X $rs1, 
$rs2, $rs3, FRM_DYN)>; + (FMADD_D_IN32X $rs1, $rs2, $rs3, (i32 FRM_DYN))>; // fmsub: rs1 * rs2 - rs3 def : Pat<(any_fma FPR64IN32X:$rs1, FPR64IN32X:$rs2, (fneg FPR64IN32X:$rs3)), - (FMSUB_D_IN32X FPR64IN32X:$rs1, FPR64IN32X:$rs2, FPR64IN32X:$rs3, FRM_DYN)>; + (FMSUB_D_IN32X FPR64IN32X:$rs1, FPR64IN32X:$rs2, FPR64IN32X:$rs3, + (i32 FRM_DYN))>; // fnmsub: -rs1 * rs2 + rs3 def : Pat<(any_fma (fneg FPR64IN32X:$rs1), FPR64IN32X:$rs2, FPR64IN32X:$rs3), - (FNMSUB_D_IN32X FPR64IN32X:$rs1, FPR64IN32X:$rs2, FPR64IN32X:$rs3, FRM_DYN)>; + (FNMSUB_D_IN32X FPR64IN32X:$rs1, FPR64IN32X:$rs2, FPR64IN32X:$rs3, + (i32 FRM_DYN))>; // fnmadd: -rs1 * rs2 - rs3 def : Pat<(any_fma (fneg FPR64IN32X:$rs1), FPR64IN32X:$rs2, (fneg FPR64IN32X:$rs3)), - (FNMADD_D_IN32X FPR64IN32X:$rs1, FPR64IN32X:$rs2, FPR64IN32X:$rs3, FRM_DYN)>; + (FNMADD_D_IN32X FPR64IN32X:$rs1, FPR64IN32X:$rs2, FPR64IN32X:$rs3, + (i32 FRM_DYN))>; // fnmadd: -(rs1 * rs2 + rs3) (the nsz flag on the FMA) def : Pat<(fneg (any_fma_nsz FPR64IN32X:$rs1, FPR64IN32X:$rs2, FPR64IN32X:$rs3)), - (FNMADD_D_IN32X FPR64IN32X:$rs1, FPR64IN32X:$rs2, FPR64IN32X:$rs3, FRM_DYN)>; + (FNMADD_D_IN32X FPR64IN32X:$rs1, FPR64IN32X:$rs2, FPR64IN32X:$rs3, + (i32 FRM_DYN))>; } // Predicates = [HasStdExtZdinx, IsRV32] // The ratified 20191213 ISA spec defines fmin and fmax in a way that matches @@ -441,42 +448,42 @@ def : PatSetCC<FPR64, any_fsetccs, SETOLE, FLE_D, f64>; let Predicates = [HasStdExtZdinx, IsRV64] in { // Match signaling FEQ_D -def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs2, SETEQ)), +def : Pat<(XLenVT (strict_fsetccs FPR64INX:$rs1, FPR64INX:$rs2, SETEQ)), (AND (XLenVT (FLE_D_INX $rs1, $rs2)), (XLenVT (FLE_D_INX $rs2, $rs1)))>; -def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs2, SETOEQ)), +def : Pat<(XLenVT (strict_fsetccs FPR64INX:$rs1, FPR64INX:$rs2, SETOEQ)), (AND (XLenVT (FLE_D_INX $rs1, $rs2)), (XLenVT (FLE_D_INX $rs2, $rs1)))>; // If both operands are the same, use a single FLE. -def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs1, SETEQ)), +def : Pat<(XLenVT (strict_fsetccs FPR64INX:$rs1, FPR64INX:$rs1, SETEQ)), (FLE_D_INX $rs1, $rs1)>; -def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs1, SETOEQ)), +def : Pat<(XLenVT (strict_fsetccs FPR64INX:$rs1, FPR64INX:$rs1, SETOEQ)), (FLE_D_INX $rs1, $rs1)>; -def : PatSetCC<FPR64INX, any_fsetccs, SETLT, FLT_D_INX, f64>; -def : PatSetCC<FPR64INX, any_fsetccs, SETOLT, FLT_D_INX, f64>; -def : PatSetCC<FPR64INX, any_fsetccs, SETLE, FLE_D_INX, f64>; -def : PatSetCC<FPR64INX, any_fsetccs, SETOLE, FLE_D_INX, f64>; +def : PatSetCC<FPR64INX, any_fsetccs, SETLT, FLT_D_INX, f64, i64>; +def : PatSetCC<FPR64INX, any_fsetccs, SETOLT, FLT_D_INX, f64, i64>; +def : PatSetCC<FPR64INX, any_fsetccs, SETLE, FLE_D_INX, f64, i64>; +def : PatSetCC<FPR64INX, any_fsetccs, SETOLE, FLE_D_INX, f64, i64>; } // Predicates = [HasStdExtZdinx, IsRV64] let Predicates = [HasStdExtZdinx, IsRV32] in { // Match signaling FEQ_D -def : Pat<(XLenVT (strict_fsetccs (f64 FPR64IN32X:$rs1), FPR64IN32X:$rs2, SETEQ)), +def : Pat<(i32 (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs2, SETEQ)), (AND (XLenVT (FLE_D_IN32X $rs1, $rs2)), (XLenVT (FLE_D_IN32X $rs2, $rs1)))>; -def : Pat<(XLenVT (strict_fsetccs (f64 FPR64IN32X:$rs1), FPR64IN32X:$rs2, SETOEQ)), +def : Pat<(i32 (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs2, SETOEQ)), (AND (XLenVT (FLE_D_IN32X $rs1, $rs2)), (XLenVT (FLE_D_IN32X $rs2, $rs1)))>; // If both operands are the same, use a single FLE. 
-def : Pat<(XLenVT (strict_fsetccs (f64 FPR64IN32X:$rs1), FPR64IN32X:$rs1, SETEQ)), +def : Pat<(i32 (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs1, SETEQ)), (FLE_D_IN32X $rs1, $rs1)>; -def : Pat<(XLenVT (strict_fsetccs (f64 FPR64IN32X:$rs1), FPR64IN32X:$rs1, SETOEQ)), +def : Pat<(i32 (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs1, SETOEQ)), (FLE_D_IN32X $rs1, $rs1)>; -def : PatSetCC<FPR64IN32X, any_fsetccs, SETLT, FLT_D_IN32X, f64>; -def : PatSetCC<FPR64IN32X, any_fsetccs, SETOLT, FLT_D_IN32X, f64>; -def : PatSetCC<FPR64IN32X, any_fsetccs, SETLE, FLE_D_IN32X, f64>; -def : PatSetCC<FPR64IN32X, any_fsetccs, SETOLE, FLE_D_IN32X, f64>; +def : PatSetCC<FPR64IN32X, any_fsetccs, SETLT, FLT_D_IN32X, f64, i32>; +def : PatSetCC<FPR64IN32X, any_fsetccs, SETOLT, FLT_D_IN32X, f64, i32>; +def : PatSetCC<FPR64IN32X, any_fsetccs, SETLE, FLE_D_IN32X, f64, i32>; +def : PatSetCC<FPR64IN32X, any_fsetccs, SETOLE, FLE_D_IN32X, f64, i32>; } // Predicates = [HasStdExtZdinx, IsRV32] let Predicates = [HasStdExtD] in { @@ -511,7 +518,7 @@ def SplitF64Pseudo } // Predicates = [HasStdExtD, NoStdExtZfa, IsRV32] let Predicates = [HasStdExtZdinx, IsRV64] in { -defm Select_FPR64INX : SelectCC_GPR_rrirr<FPR64INX, f64>; +defm Select_FPR64INX : SelectCC_GPR_rrirr<FPR64INX, f64, i64>; def PseudoFROUND_D_INX : PseudoFROUND<FPR64INX, f64>; @@ -523,9 +530,9 @@ def : StPat<store, SD, GPR, f64>; } // Predicates = [HasStdExtZdinx, IsRV64] let Predicates = [HasStdExtZdinx, IsRV32] in { -defm Select_FPR64IN32X : SelectCC_GPR_rrirr<FPR64IN32X, f64>; +defm Select_FPR64IN32X : SelectCC_GPR_rrirr<FPR64IN32X, f64, i32>; -def PseudoFROUND_D_IN32X : PseudoFROUND<FPR64IN32X, f64>; +def PseudoFROUND_D_IN32X : PseudoFROUND<FPR64IN32X, f64, i32>; /// Loads let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 1 in diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index fde030e..6571d99 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -131,7 +131,7 @@ def FPR32INX : RegisterOperand<GPRF32> { // The DAGOperand can be unset if the predicates are not enough to define it. 
class ExtInfo<string suffix, string space, list<Predicate> predicates, ValueType primaryvt, DAGOperand primaryty, DAGOperand f32ty, - DAGOperand f64ty, DAGOperand f16ty> { + DAGOperand f64ty, DAGOperand f16ty, ValueType intvt = XLenVT> { list<Predicate> Predicates = predicates; string Suffix = suffix; string Space = space; @@ -140,6 +140,7 @@ class ExtInfo<string suffix, string space, list<Predicate> predicates, DAGOperand F32Ty = f32ty; DAGOperand F64Ty = f64ty; ValueType PrimaryVT = primaryvt; + ValueType IntVT = intvt; } def FExt : ExtInfo<"", "", [HasStdExtF], f32, FPR32, FPR32, ?, ?>; @@ -314,9 +315,9 @@ multiclass FPCmp_rr_m<bits<7> funct7, bits<3> funct3, string opcodestr, def Ext.Suffix : FPCmp_rr<funct7, funct3, opcodestr, Ext.PrimaryTy, Commutable>; } -class PseudoFROUND<DAGOperand Ty, ValueType vt> +class PseudoFROUND<DAGOperand Ty, ValueType vt, ValueType intvt = XLenVT> : Pseudo<(outs Ty:$rd), (ins Ty:$rs1, Ty:$rs2, ixlenimm:$rm), - [(set Ty:$rd, (vt (riscv_fround Ty:$rs1, Ty:$rs2, timm:$rm)))]> { + [(set Ty:$rd, (vt (riscv_fround Ty:$rs1, Ty:$rs2, (intvt timm:$rm))))]> { let hasSideEffects = 0; let mayLoad = 0; let mayStore = 0; @@ -529,13 +530,14 @@ def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>; /// Generic pattern classes class PatSetCC<DAGOperand Ty, SDPatternOperator OpNode, CondCode Cond, - RVInstCommon Inst, ValueType vt> - : Pat<(XLenVT (OpNode (vt Ty:$rs1), Ty:$rs2, Cond)), (Inst $rs1, $rs2)>; + RVInstCommon Inst, ValueType vt, ValueType intvt = XLenVT> + : Pat<(intvt (OpNode (vt Ty:$rs1), Ty:$rs2, Cond)), (Inst $rs1, $rs2)>; multiclass PatSetCC_m<SDPatternOperator OpNode, CondCode Cond, RVInstCommon Inst, ExtInfo Ext> { let Predicates = Ext.Predicates in def Ext.Suffix : PatSetCC<Ext.PrimaryTy, OpNode, Cond, - !cast<RVInstCommon>(Inst#Ext.Suffix), Ext.PrimaryVT>; + !cast<RVInstCommon>(Inst#Ext.Suffix), + Ext.PrimaryVT, Ext.IntVT>; } class PatFprFpr<SDPatternOperator OpNode, RVInstR Inst, @@ -549,14 +551,15 @@ multiclass PatFprFpr_m<SDPatternOperator OpNode, RVInstR Inst, } class PatFprFprDynFrm<SDPatternOperator OpNode, RVInstRFrm Inst, - DAGOperand RegTy, ValueType vt> - : Pat<(OpNode (vt RegTy:$rs1), (vt RegTy:$rs2)), (Inst $rs1, $rs2, FRM_DYN)>; + DAGOperand RegTy, ValueType vt, ValueType intvt> + : Pat<(OpNode (vt RegTy:$rs1), (vt RegTy:$rs2)), + (Inst $rs1, $rs2,(intvt FRM_DYN))>; multiclass PatFprFprDynFrm_m<SDPatternOperator OpNode, RVInstRFrm Inst, ExtInfo Ext> { let Predicates = Ext.Predicates in def Ext.Suffix : PatFprFprDynFrm<OpNode, !cast<RVInstRFrm>(Inst#Ext.Suffix), - Ext.PrimaryTy, Ext.PrimaryVT>; + Ext.PrimaryTy, Ext.PrimaryVT, Ext.IntVT>; } /// Float conversion operations diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index 014da99..52a2b29 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -69,16 +69,16 @@ def ZhinxminExt : ExtInfo<"_INX", "Zfinx", f16, FPR16INX, FPR32INX, ?, FPR16INX>; def ZhinxZdinxExt : ExtInfo<"_INX", "Zfinx", [HasStdExtZhinx, HasStdExtZdinx, IsRV64], - ?, ?, FPR32INX, FPR64INX, FPR16INX>; + ?, ?, FPR32INX, FPR64INX, FPR16INX, i64>; def ZhinxminZdinxExt : ExtInfo<"_INX", "Zfinx", [HasStdExtZhinxmin, HasStdExtZdinx, IsRV64], - ?, ?, FPR32INX, FPR64INX, FPR16INX>; + ?, ?, FPR32INX, FPR64INX, FPR16INX, i64>; def ZhinxZdinx32Ext : ExtInfo<"_IN32X", "ZdinxGPRPairRV32", [HasStdExtZhinx, HasStdExtZdinx, IsRV32], - ?, ?, FPR32INX, FPR64IN32X, FPR16INX>; + ?, ?, FPR32INX, FPR64IN32X, FPR16INX, i32>; def 
ZhinxminZdinx32Ext : ExtInfo<"_IN32X", "ZdinxGPRPairRV32", [HasStdExtZhinxmin, HasStdExtZdinx, IsRV32], - ?, ?, FPR32INX, FPR64IN32X, FPR16INX>; + ?, ?, FPR32INX, FPR64IN32X, FPR16INX, i32>; defvar ZfhExts = [ZfhExt, ZhinxExt]; defvar ZfhminExts = [ZfhminExt, ZhinxminExt]; diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt index 46afe03..eab7b21 100644 --- a/llvm/lib/Target/SPIRV/CMakeLists.txt +++ b/llvm/lib/Target/SPIRV/CMakeLists.txt @@ -36,6 +36,7 @@ add_llvm_target(SPIRVCodeGen SPIRVMetadata.cpp SPIRVModuleAnalysis.cpp SPIRVStructurizer.cpp + SPIRVCombinerHelper.cpp SPIRVPreLegalizer.cpp SPIRVPreLegalizerCombiner.cpp SPIRVPostLegalizer.cpp diff --git a/llvm/lib/Target/SPIRV/SPIRVCombine.td b/llvm/lib/Target/SPIRV/SPIRVCombine.td index 6f726e0..fde56c4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCombine.td +++ b/llvm/lib/Target/SPIRV/SPIRVCombine.td @@ -11,8 +11,8 @@ include "llvm/Target/GlobalISel/Combine.td" def vector_length_sub_to_distance_lowering : GICombineRule < (defs root:$root), (match (wip_match_opcode G_INTRINSIC):$root, - [{ return matchLengthToDistance(*${root}, MRI); }]), - (apply [{ applySPIRVDistance(*${root}, MRI, B); }]) + [{ return Helper.matchLengthToDistance(*${root}); }]), + (apply [{ Helper.applySPIRVDistance(*${root}); }]) >; def SPIRVPreLegalizerCombiner diff --git a/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.cpp b/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.cpp new file mode 100644 index 0000000..267794c --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.cpp @@ -0,0 +1,60 @@ +//===-- SPIRVCombinerHelper.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SPIRVCombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; +using namespace MIPatternMatch; + +SPIRVCombinerHelper::SPIRVCombinerHelper( + GISelChangeObserver &Observer, MachineIRBuilder &B, bool IsPreLegalize, + GISelValueTracking *VT, MachineDominatorTree *MDT, const LegalizerInfo *LI, + const SPIRVSubtarget &STI) + : CombinerHelper(Observer, B, IsPreLegalize, VT, MDT, LI), STI(STI) {} + +/// This match is part of a combine that +/// rewrites length(X - Y) to distance(X, Y) +/// (f32 (g_intrinsic length +/// (g_fsub (vXf32 X) (vXf32 Y)))) +/// -> +/// (f32 (g_intrinsic distance +/// (vXf32 X) (vXf32 Y))) +/// +bool SPIRVCombinerHelper::matchLengthToDistance(MachineInstr &MI) const { + if (MI.getOpcode() != TargetOpcode::G_INTRINSIC || + cast<GIntrinsic>(MI).getIntrinsicID() != Intrinsic::spv_length) + return false; + + // First operand of MI is `G_INTRINSIC` so start at operand 2. + Register SubReg = MI.getOperand(2).getReg(); + MachineInstr *SubInstr = MRI.getVRegDef(SubReg); + if (SubInstr->getOpcode() != TargetOpcode::G_FSUB) + return false; + + return true; +} + +void SPIRVCombinerHelper::applySPIRVDistance(MachineInstr &MI) const { + // Extract the operands for X and Y from the match criteria. 
+ Register SubDestReg = MI.getOperand(2).getReg(); + MachineInstr *SubInstr = MRI.getVRegDef(SubDestReg); + Register SubOperand1 = SubInstr->getOperand(1).getReg(); + Register SubOperand2 = SubInstr->getOperand(2).getReg(); + Register ResultReg = MI.getOperand(0).getReg(); + + Builder.setInstrAndDebugLoc(MI); + Builder.buildIntrinsic(Intrinsic::spv_distance, ResultReg) + .addUse(SubOperand1) + .addUse(SubOperand2); + + MI.eraseFromParent(); +} diff --git a/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.h b/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.h new file mode 100644 index 0000000..0b39d34 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.h @@ -0,0 +1,38 @@ +//===-- SPIRVCombinerHelper.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// This contains common combine transformations that may be used in a combine +/// pass. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVCOMBINERHELPER_H +#define LLVM_LIB_TARGET_SPIRV_SPIRVCOMBINERHELPER_H + +#include "SPIRVSubtarget.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" + +namespace llvm { +class SPIRVCombinerHelper : public CombinerHelper { +protected: + const SPIRVSubtarget &STI; + +public: + using CombinerHelper::CombinerHelper; + SPIRVCombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B, + bool IsPreLegalize, GISelValueTracking *VT, + MachineDominatorTree *MDT, const LegalizerInfo *LI, + const SPIRVSubtarget &STI); + + bool matchLengthToDistance(MachineInstr &MI) const; + void applySPIRVDistance(MachineInstr &MI) const; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_SPIRV_SPIRVCOMBINERHELPER_H diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index e8c849e..28a1690 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -46,7 +46,6 @@ #include "SPIRVSubtarget.h" #include "SPIRVTargetMachine.h" #include "SPIRVUtils.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp index 20f03b0..60d39c9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp @@ -19,7 +19,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Intrinsics.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp index 8356751..48f4047 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp @@ -1,4 +1,3 @@ - //===-- SPIRVPreLegalizerCombiner.cpp - combine legalization ----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
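The matchLengthToDistance/applySPIRVDistance pair above implements the rewrite length(X - Y) -> distance(X, Y). The identity it relies on can be sketched in self-contained C++; the Vec3, length, and distance helpers below are hypothetical illustrations of the SPIR-V builtins, not LLVM or SPIR-V API:

#include <array>
#include <cmath>

using Vec3 = std::array<float, 3>;

// length(V) is the Euclidean norm sqrt(V . V).
static float length(const Vec3 &V) {
  return std::sqrt(V[0] * V[0] + V[1] * V[1] + V[2] * V[2]);
}

// distance(X, Y) == length(X - Y), which is exactly the equivalence the
// combine exploits: a subtraction feeding spv_length becomes a single
// spv_distance call.
static float distance(const Vec3 &X, const Vec3 &Y) {
  return length({X[0] - Y[0], X[1] - Y[1], X[2] - Y[2]});
}

Folding the G_FSUB plus spv_length into spv_distance lets the backend select the dedicated Distance extended instruction rather than a subtract followed by Length.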
@@ -13,24 +12,17 @@ //===----------------------------------------------------------------------===// #include "SPIRV.h" -#include "SPIRVTargetMachine.h" +#include "SPIRVCombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" -#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h" -#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/IntrinsicsSPIRV.h" #define GET_GICOMBINER_DEPS #include "SPIRVGenPreLegalizeGICombiner.inc" @@ -47,72 +39,9 @@ namespace { #include "SPIRVGenPreLegalizeGICombiner.inc" #undef GET_GICOMBINER_TYPES -/// This match is part of a combine that -/// rewrites length(X - Y) to distance(X, Y) -/// (f32 (g_intrinsic length -/// (g_fsub (vXf32 X) (vXf32 Y)))) -/// -> -/// (f32 (g_intrinsic distance -/// (vXf32 X) (vXf32 Y))) -/// -bool matchLengthToDistance(MachineInstr &MI, MachineRegisterInfo &MRI) { - if (MI.getOpcode() != TargetOpcode::G_INTRINSIC || - cast<GIntrinsic>(MI).getIntrinsicID() != Intrinsic::spv_length) - return false; - - // First operand of MI is `G_INTRINSIC` so start at operand 2. - Register SubReg = MI.getOperand(2).getReg(); - MachineInstr *SubInstr = MRI.getVRegDef(SubReg); - if (!SubInstr || SubInstr->getOpcode() != TargetOpcode::G_FSUB) - return false; - - return true; -} -void applySPIRVDistance(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) { - - // Extract the operands for X and Y from the match criteria. - Register SubDestReg = MI.getOperand(2).getReg(); - MachineInstr *SubInstr = MRI.getVRegDef(SubDestReg); - Register SubOperand1 = SubInstr->getOperand(1).getReg(); - Register SubOperand2 = SubInstr->getOperand(2).getReg(); - - // Remove the original `spv_length` instruction. - - Register ResultReg = MI.getOperand(0).getReg(); - DebugLoc DL = MI.getDebugLoc(); - MachineBasicBlock &MBB = *MI.getParent(); - MachineBasicBlock::iterator InsertPt = MI.getIterator(); - - // Build the `spv_distance` intrinsic. - MachineInstrBuilder NewInstr = - BuildMI(MBB, InsertPt, DL, B.getTII().get(TargetOpcode::G_INTRINSIC)); - NewInstr - .addDef(ResultReg) // Result register - .addIntrinsicID(Intrinsic::spv_distance) // Intrinsic ID - .addUse(SubOperand1) // Operand X - .addUse(SubOperand2); // Operand Y - - SPIRVGlobalRegistry *GR = - MI.getMF()->getSubtarget<SPIRVSubtarget>().getSPIRVGlobalRegistry(); - auto RemoveAllUses = [&](Register Reg) { - SmallVector<MachineInstr *, 4> UsesToErase( - llvm::make_pointer_range(MRI.use_instructions(Reg))); - - // calling eraseFromParent to early invalidates the iterator. 
- for (auto *MIToErase : UsesToErase) { - GR->invalidateMachineInstr(MIToErase); - MIToErase->eraseFromParent(); - } - }; - RemoveAllUses(SubDestReg); // remove all uses of FSUB Result - GR->invalidateMachineInstr(SubInstr); - SubInstr->eraseFromParent(); // remove FSUB instruction -} - class SPIRVPreLegalizerCombinerImpl : public Combiner { protected: - const CombinerHelper Helper; + const SPIRVCombinerHelper Helper; const SPIRVPreLegalizerCombinerImplRuleConfig &RuleConfig; const SPIRVSubtarget &STI; @@ -147,7 +76,7 @@ SPIRVPreLegalizerCombinerImpl::SPIRVPreLegalizerCombinerImpl( const SPIRVSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) : Combiner(MF, CInfo, TPC, &VT, CSEInfo), - Helper(Observer, B, /*IsPreLegalize*/ true, &VT, MDT, LI), + Helper(Observer, B, /*IsPreLegalize*/ true, &VT, MDT, LI, STI), RuleConfig(RuleConfig), STI(STI), #define GET_GICOMBINER_CONSTRUCTOR_INITS #include "SPIRVGenPreLegalizeGICombiner.inc" diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp index 278ad7c..e621bcd44 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp @@ -14,7 +14,6 @@ #include "SPIRV.h" #include "SPIRVSubtarget.h" #include "SPIRVUtils.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Transforms/Utils/Cloning.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp index 1811492..5b149f8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9580ade..eea84a2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -28,7 +28,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -3634,7 +3633,7 @@ bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const { } bool X86TargetLowering::shouldFoldConstantShiftPairToMask( - const SDNode *N, CombineLevel Level) const { + const SDNode *N) const { assert(((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && @@ -3649,7 +3648,7 @@ bool X86TargetLowering::shouldFoldConstantShiftPairToMask( // the fold for non-splats yet. 
return N->getOperand(1) == N->getOperand(0).getOperand(1); } - return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level); + return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N); } bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index b55556a..e28b9c1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1244,8 +1244,7 @@ namespace llvm { getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override; - bool shouldFoldConstantShiftPairToMask(const SDNode *N, - CombineLevel Level) const override; + bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override; bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index ac3626d..f021094 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -375,6 +375,8 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) { case MuslSF: return "muslsf"; case MuslX32: return "muslx32"; + case MuslWALI: + return "muslwali"; case Simulator: return "simulator"; case Pixel: return "pixel"; case Vertex: return "vertex"; @@ -767,6 +769,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { .StartsWith("muslf32", Triple::MuslF32) .StartsWith("muslsf", Triple::MuslSF) .StartsWith("muslx32", Triple::MuslX32) + .StartsWith("muslwali", Triple::MuslWALI) .StartsWith("musl", Triple::Musl) .StartsWith("msvc", Triple::MSVC) .StartsWith("itanium", Triple::Itanium) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 59e103cd..73ec451 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -880,11 +880,13 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) { // zext(bool) + C -> bool ? C + 1 : C if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->getScalarSizeInBits() == 1) - return createSelectInst(X, InstCombiner::AddOne(Op1C), Op1); + return createSelectInstWithUnknownProfile(X, InstCombiner::AddOne(Op1C), + Op1); // sext(bool) + C -> bool ? C - 1 : C if (match(Op0, m_SExt(m_Value(X))) && X->getType()->getScalarSizeInBits() == 1) - return createSelectInst(X, InstCombiner::SubOne(Op1C), Op1); + return createSelectInstWithUnknownProfile(X, InstCombiner::SubOne(Op1C), + Op1); // ~X + C --> (C-1) - X if (match(Op0, m_Not(m_Value(X)))) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 218aaf9..7071876 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -471,18 +471,19 @@ private: Value *simplifyNonNullOperand(Value *V, bool HasDereferenceable, unsigned Depth = 0); - SelectInst *createSelectInst(Value *C, Value *S1, Value *S2, - const Twine &NameStr = "", - InsertPosition InsertBefore = nullptr, - Instruction *MDFrom = nullptr) { - SelectInst *SI = - SelectInst::Create(C, S1, S2, NameStr, InsertBefore, MDFrom); - if (!MDFrom) - setExplicitlyUnknownBranchWeightsIfProfiled(*SI, F, DEBUG_TYPE); - return SI; +public: + /// Create `select C, S1, S2`. 
Use only when the profile cannot be calculated
+  /// from existing profile metadata: if the Function has profiles, this will
+  /// set the profile of this select to "unknown".
+  SelectInst *
+  createSelectInstWithUnknownProfile(Value *C, Value *S1, Value *S2,
+                                     const Twine &NameStr = "",
+                                     InsertPosition InsertBefore = nullptr) {
+    auto *Sel = SelectInst::Create(C, S1, S2, NameStr, InsertBefore, nullptr);
+    setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, F, DEBUG_TYPE);
+    return Sel;
   }
-public:
   /// Create and insert the idiom we use to indicate a block is unreachable
   /// without having to rewrite the CFG from within InstCombine.
   void CreateNonTerminatorUnreachable(Instruction *InsertAt) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 8c8fc69..6b67b48 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -544,8 +544,18 @@ Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
     Value *NewSel = Builder.CreateSelect(SI.getCondition(), Swapped ? C : OOp,
                                          Swapped ? OOp : C, "", &SI);
-    if (isa<FPMathOperator>(&SI))
-      cast<Instruction>(NewSel)->setFastMathFlags(FMF);
+    if (isa<FPMathOperator>(&SI)) {
+      FastMathFlags NewSelFMF = FMF;
+      // We cannot propagate ninf from the original select, because OOp may be
+      // inf and the flag only guarantees that FalseVal (op OOp) is never
+      // infinity.
+      // Examples: -inf + +inf = NaN, -inf - -inf = NaN, 0 * inf = NaN
+      // However, if the original select has both ninf and nnan, we can
+      // safely propagate the flag.
+      NewSelFMF.setNoInfs(TVI->hasNoInfs() ||
+                          (NewSelFMF.noInfs() && NewSelFMF.noNaNs()));
+      cast<Instruction>(NewSel)->setFastMathFlags(NewSelFMF);
+    }
     NewSel->takeName(TVI);
     BinaryOperator *BO =
         BinaryOperator::Create(TVI->getOpcode(), FalseVal, NewSel);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index d457e0c..899a3c1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1253,7 +1253,8 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
     // shl (zext i1 X), C1 --> select (X, 1 << C1, 0)
     if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
       auto *NewC = Builder.CreateShl(ConstantInt::get(Ty, 1), C1);
-      return createSelectInst(X, NewC, ConstantInt::getNullValue(Ty));
+      return createSelectInstWithUnknownProfile(X, NewC,
+                                                ConstantInt::getNullValue(Ty));
     }
   }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 127a506..63e24a0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -16,6 +16,7 @@
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
@@ -107,7 +108,10 @@ static Value *simplifyShiftSelectingPackedElement(Instruction *I,
   Value *ShrAmtZ =
       IC.Builder.CreateICmpEQ(ShrAmt, Constant::getNullValue(ShrAmt->getType()),
                               ShrAmt->getName() + ".z");
-  Value *Select = IC.Builder.CreateSelect(ShrAmtZ, Lower, Upper);
+  // There is no existing !prof metadata from which to derive the !prof
+  // metadata for this
select. + Value *Select = IC.createSelectInstWithUnknownProfile(ShrAmtZ, Lower, Upper); + IC.Builder.Insert(Select); Select->takeName(I); return Select; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index d56a1af..82ac903 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1737,7 +1737,7 @@ Instruction *InstCombinerImpl::foldBinopOfSextBoolToSelect(BinaryOperator &BO) { Constant *Zero = ConstantInt::getNullValue(BO.getType()); Value *TVal = Builder.CreateBinOp(BO.getOpcode(), Ones, C); Value *FVal = Builder.CreateBinOp(BO.getOpcode(), Zero, C); - return createSelectInst(X, TVal, FVal); + return createSelectInstWithUnknownProfile(X, TVal, FVal); } static Value *simplifyOperationIntoSelectOperand(Instruction &I, SelectInst *SI, diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index e5935f4..ff5f390 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -386,22 +386,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) { return OS; } -/// Helper to get the successor corresponding to a particular case value for -/// a switch statement. -static BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, - const APInt &NextState) { - BasicBlock *NextCase = nullptr; - for (auto Case : Switch->cases()) { - if (Case.getCaseValue()->getValue() == NextState) { - NextCase = Case.getCaseSuccessor(); - break; - } - } - if (!NextCase) - NextCase = Switch->getDefaultDest(); - return NextCase; -} - namespace { /// ThreadingPath is a path in the control flow of a loop that can be threaded /// by cloning necessary basic blocks and replacing conditional branches with @@ -835,19 +819,32 @@ private: TPaths = std::move(TempList); } + /// Fast helper to get the successor corresponding to a particular case value + /// for a switch statement. + BasicBlock *getNextCaseSuccessor(const APInt &NextState) { + // Precompute the value => successor mapping + if (CaseValToDest.empty()) { + for (auto Case : Switch->cases()) { + APInt CaseVal = Case.getCaseValue()->getValue(); + CaseValToDest[CaseVal] = Case.getCaseSuccessor(); + } + } + + auto SuccIt = CaseValToDest.find(NextState); + return SuccIt == CaseValToDest.end() ? Switch->getDefaultDest() + : SuccIt->second; + } + // Two states are equivalent if they have the same switch destination. // Unify the states in different threading path if the states are equivalent. 
void unifyTPaths() { - llvm::SmallDenseMap<BasicBlock *, APInt> DestToState; + SmallDenseMap<BasicBlock *, APInt> DestToState; for (ThreadingPath &Path : TPaths) { APInt NextState = Path.getExitValue(); - BasicBlock *Dest = getNextCaseSuccessor(Switch, NextState); - auto StateIt = DestToState.find(Dest); - if (StateIt == DestToState.end()) { - DestToState.insert({Dest, NextState}); + BasicBlock *Dest = getNextCaseSuccessor(NextState); + auto [StateIt, Inserted] = DestToState.try_emplace(Dest, NextState); + if (Inserted) continue; - } - if (NextState != StateIt->second) { LLVM_DEBUG(dbgs() << "Next state in " << Path << " is equivalent to " << StateIt->second << "\n"); @@ -861,6 +858,7 @@ private: BasicBlock *SwitchBlock; OptimizationRemarkEmitter *ORE; std::vector<ThreadingPath> TPaths; + DenseMap<APInt, BasicBlock *> CaseValToDest; LoopInfo *LI; Loop *SwitchOuterLoop; }; @@ -1162,6 +1160,24 @@ private: SSAUpdate.RewriteAllUses(&DTU->getDomTree()); } + /// Helper to get the successor corresponding to a particular case value for + /// a switch statement. + /// TODO: Unify it with SwitchPaths->getNextCaseSuccessor(SwitchInst *Switch) + /// by updating cached value => successor mapping during threading. + static BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, + const APInt &NextState) { + BasicBlock *NextCase = nullptr; + for (auto Case : Switch->cases()) { + if (Case.getCaseValue()->getValue() == NextState) { + NextCase = Case.getCaseSuccessor(); + break; + } + } + if (!NextCase) + NextCase = Switch->getDefaultDest(); + return NextCase; + } + /// Clones a basic block, and adds it to the CFG. /// /// This function also includes updating phi nodes in the successors of the diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 3a8ade8..42db424 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -2156,6 +2156,9 @@ bool GVNPass::processLoad(LoadInst *L) { if (!L->isUnordered()) return false; + if (L->getType()->isTokenLikeTy()) + return false; + if (L->use_empty()) { salvageAndRemoveInstruction(L); return true; diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 45d3d49..b9d332b 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2961,6 +2961,7 @@ public: isa<FixedVectorType>(NewAI.getAllocatedType()) ? cast<FixedVectorType>(NewAI.getAllocatedType())->getElementType() : Type::getInt8Ty(NewAI.getContext()); + unsigned AllocatedEltTySize = DL.getTypeSizeInBits(AllocatedEltTy); // Helper to check if a type is // 1. A fixed vector type @@ -2991,10 +2992,17 @@ public: // Do not handle the case if // 1. The store does not meet the conditions in the helper function // 2. The store is volatile + // 3. 
The total store size is not a multiple of the allocated element
+      //       type size.
       if (!IsTypeValidForTreeStructuredMerge(
               SI->getValueOperand()->getType()) ||
           SI->isVolatile())
         return std::nullopt;
+      auto *VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
+      unsigned NumElts = VecTy->getNumElements();
+      unsigned EltSize = DL.getTypeSizeInBits(VecTy->getElementType());
+      if (NumElts * EltSize % AllocatedEltTySize != 0)
+        return std::nullopt;
       StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
                               SI->getValueOperand());
     } else {
diff --git a/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/llvm/lib/Transforms/Utils/InstructionNamer.cpp
index 3ae570c..4f1ff7b 100644
--- a/llvm/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/llvm/lib/Transforms/Utils/InstructionNamer.cpp
@@ -20,9 +20,8 @@
 using namespace llvm;
 
-namespace {
-void nameInstructions(Function &F) {
-  for (auto &Arg : F.args()) {
+static void nameInstructions(Function &F) {
+  for (Argument &Arg : F.args()) {
     if (!Arg.hasName())
       Arg.setName("arg");
   }
@@ -38,8 +37,6 @@
     }
   }
 }
-} // namespace
-
 PreservedAnalyses InstructionNamerPass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
   nameInstructions(F);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cfa8d27..2388375 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2245,6 +2245,26 @@ public:
                      Align Alignment, const int64_t Diff, Value *Ptr0,
                      Value *PtrN, StridedPtrInfo &SPtrInfo) const;
 
+  /// Return true if an array of scalar loads can be replaced with a strided
+  /// load (with run-time stride).
+  /// \param PointerOps list of pointer arguments of loads.
+  /// \param ScalarTy type of loads.
+  /// \param CommonAlignment common alignment of loads as computed by
+  /// `computeCommonAlignment<LoadInst>`.
+  /// \param SortedIndices is a list of indices computed by this function such
+  /// that the sequence `PointerOps[SortedIndices[0]],
+  /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
+  /// ordered by the coefficient of the stride. For example, if PointerOps is
+  /// `%base + %stride, %base, %base + 2 * %stride`, the `SortedIndices` will
+  /// be `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to
+  /// be `0, 1, 2, 3, ...`, we return an empty vector for `SortedIndices`.
+  /// \param SPtrInfo If the function returns `true`, it also sets all the
+  /// fields of `SPtrInfo` necessary to generate the strided load later.
+  bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
+                                Align CommonAlignment,
+                                SmallVectorImpl<unsigned> &SortedIndices,
+                                StridedPtrInfo &SPtrInfo) const;
+
   /// Checks if the given array of loads can be represented as a vectorized,
   /// scatter or just simple gather.
   /// \param VL list of loads.
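The SortedIndices convention documented above for analyzeRtStrideCandidate can be made concrete with a small self-contained C++ sketch; computeSortedIndices and the explicit stride-coefficient vector are hypothetical illustrations, not SLPVectorizer code:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Given the stride coefficient of each pointer (e.g. {1, 0, 2} for
// {%base + %stride, %base, %base + 2 * %stride}), produce the permutation
// that orders the pointers by coefficient.
std::vector<unsigned> computeSortedIndices(const std::vector<int64_t> &Coeffs) {
  std::vector<unsigned> Indices(Coeffs.size());
  std::iota(Indices.begin(), Indices.end(), 0u);
  std::stable_sort(Indices.begin(), Indices.end(),
                   [&](unsigned A, unsigned B) { return Coeffs[A] < Coeffs[B]; });
  // Convention: the identity permutation 0, 1, 2, ... is reported as an
  // empty vector.
  if (std::is_sorted(Coeffs.begin(), Coeffs.end()))
    Indices.clear();
  return Indices;
}

For Coeffs = {1, 0, 2} this returns {1, 0, 2}, matching the example in the doc comment; for Coeffs = {0, 1, 2} it returns the empty vector.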
@@ -6875,6 +6895,24 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, return false; } +bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, + Type *ScalarTy, Align CommonAlignment, + SmallVectorImpl<unsigned> &SortedIndices, + StridedPtrInfo &SPtrInfo) const { + const unsigned Sz = PointerOps.size(); + FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, Sz); + if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) || + !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment)) + return false; + if (const SCEV *Stride = + calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) { + SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size()); + SPtrInfo.StrideSCEV = Stride; + return true; + } + return false; +} + BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order, SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo, @@ -6915,15 +6953,9 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( auto *VecTy = getWidenedType(ScalarTy, Sz); Align CommonAlignment = computeCommonAlignment<LoadInst>(VL); if (!IsSorted) { - if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) { - if (const SCEV *Stride = - calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order); - Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) { - SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size()); - SPtrInfo.StrideSCEV = Stride; - return LoadsState::StridedVectorize; - } - } + if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order, + SPtrInfo)) + return LoadsState::StridedVectorize; if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) @@ -10632,7 +10664,9 @@ class InstructionsCompatibilityAnalysis { void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) { BasicBlock *Parent = nullptr; // Checks if the instruction has supported opcode. - auto IsSupportedInstruction = [&](Instruction *I) { + auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) { + if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I))) + return false; return I && isSupportedOpcode(I->getOpcode()) && (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I)); }; @@ -10640,10 +10674,13 @@ class InstructionsCompatibilityAnalysis { // will be unable to schedule anyway. 
SmallDenseSet<Value *, 8> Operands; SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates; + bool AnyUndef = false; for (Value *V : VL) { auto *I = dyn_cast<Instruction>(V); - if (!I) + if (!I) { + AnyUndef |= isa<UndefValue>(V); continue; + } if (!DT.isReachableFromEntry(I->getParent())) continue; if (Candidates.empty()) { @@ -10678,7 +10715,7 @@ class InstructionsCompatibilityAnalysis { if (P.second.size() < BestOpcodeNum) continue; for (Instruction *I : P.second) { - if (IsSupportedInstruction(I) && !Operands.contains(I)) { + if (IsSupportedInstruction(I, AnyUndef) && !Operands.contains(I)) { MainOp = I; BestOpcodeNum = P.second.size(); break; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll index f4ae66a..4ad5b38 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll @@ -70,6 +70,12 @@ define half @t3(half %x) { ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret ; +; USE-NEON-NO-GPRS-LABEL: t3: +; USE-NEON-NO-GPRS: // %bb.0: // %entry +; USE-NEON-NO-GPRS-NEXT: fcvtzs h0, h0 +; USE-NEON-NO-GPRS-NEXT: scvtf h0, h0 +; USE-NEON-NO-GPRS-NEXT: ret +; ; NONEON-NOSVE-LABEL: t3: ; NONEON-NOSVE: // %bb.0: // %entry ; NONEON-NOSVE-NEXT: fcvt s0, h0 @@ -147,6 +153,12 @@ define half @t6(half %x) { ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret ; +; USE-NEON-NO-GPRS-LABEL: t6: +; USE-NEON-NO-GPRS: // %bb.0: // %entry +; USE-NEON-NO-GPRS-NEXT: fcvtzu h0, h0 +; USE-NEON-NO-GPRS-NEXT: ucvtf h0, h0 +; USE-NEON-NO-GPRS-NEXT: ret +; ; NONEON-NOSVE-LABEL: t6: ; NONEON-NOSVE: // %bb.0: // %entry ; NONEON-NOSVE-NEXT: fcvt s0, h0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll index c92e5c5..edb3607 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -stop-after=irtranslator -o - %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=fiji -stop-after=irtranslator -o - %s | FileCheck %s declare align(8) dereferenceable(8) ptr @declared_with_ret_deref() #0 declare align(8) ptr @unknown_decl() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index e6a8bac..2356dad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-promote-alloca < %s | 
FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -new-reg-bank-select -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -new-reg-bank-select -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -new-reg-bank-select -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -new-reg-bank-select -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -new-reg-bank-select -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX12 %s define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_kernel: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index 920d8fa..ae7f6ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck 
-check-prefix=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addrspace(1) %ptr.out) #0 { ; GCN-LABEL: v_insert_v64i32_37: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll index cfbb429..aabf256 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll @@ -1,11 +1,11 @@ -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s ; GCN-LABEL: test_local_misaligned_v2: ; GCN-DAG: ds_{{read2|load_2addr}}_b32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll index 66cdfc2..7b923f4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll @@ -1,14 +1,14 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v4.ll ; RUN: sed 
's/CODE_OBJECT_VERSION/600/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v6.ll -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -amdgpu-enable-vopd=0 < %t.v6.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -amdgpu-enable-vopd=0 < %t.v6.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-sext.mir new file mode 100644 index 0000000..87836e2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-sext.mir @@ -0,0 +1,170 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s + +--- +name: assert_sext_vgpr +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; 
CHECK-LABEL: name: assert_sext_vgpr
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %copy:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: %assert_sext:vgpr(s32) = G_ASSERT_SEXT %copy, 4
+    ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32)
+    %copy:_(s32) = COPY $vgpr0
+    %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4
+    S_ENDPGM 0, implicit %assert_sext
+...
+
+---
+name: assert_sext_sgpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr8
+
+    ; CHECK-LABEL: name: assert_sext_sgpr
+    ; CHECK: liveins: $sgpr8
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %copy:sgpr(s32) = COPY $sgpr8
+    ; CHECK-NEXT: %assert_sext:sgpr(s32) = G_ASSERT_SEXT %copy, 4
+    ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32)
+    %copy:_(s32) = COPY $sgpr8
+    %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4
+    S_ENDPGM 0, implicit %assert_sext
+...
+
+---
+name: assert_sext_agpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $agpr0
+
+    ; CHECK-LABEL: name: assert_sext_agpr
+    ; CHECK: liveins: $agpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %copy:vgpr(s32) = COPY $agpr0
+    ; CHECK-NEXT: %assert_sext:vgpr(s32) = G_ASSERT_SEXT %copy, 4
+    ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32)
+    %copy:_(s32) = COPY $agpr0
+    %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4
+    S_ENDPGM 0, implicit %assert_sext
+...
+
+---
+name: assert_sext_vgpr_regclass
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: assert_sext_vgpr_regclass
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %copy:vgpr_32(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY %copy(s32)
+    ; CHECK-NEXT: %assert_sext:vgpr(s32) = G_ASSERT_SEXT [[COPY]], 4
+    ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32)
+    %copy:vgpr_32(s32) = COPY $vgpr0
+    %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4
+    S_ENDPGM 0, implicit %assert_sext
+...
+
+---
+name: assert_sext_sgpr_regclass
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr8
+
+    ; CHECK-LABEL: name: assert_sext_sgpr_regclass
+    ; CHECK: liveins: $sgpr8
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %copy:sgpr_32(s32) = COPY $sgpr8
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY %copy(s32)
+    ; CHECK-NEXT: %assert_sext:sgpr(s32) = G_ASSERT_SEXT [[COPY]], 4
+    ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32)
+    %copy:sgpr_32(s32) = COPY $sgpr8
+    %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4
+    S_ENDPGM 0, implicit %assert_sext
+...
+
+---
+name: assert_sext_vgpr_64
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: assert_sext_vgpr_64
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %copy:vreg_64(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY %copy(s64)
+    ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:vgpr(s64) = G_ASSERT_SEXT [[COPY]], 4
+    ; CHECK-NEXT: %assert_sext:vreg_64(s64) = COPY [[ASSERT_SEXT]](s64)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s64)
+    %copy:vreg_64(s64) = COPY $vgpr0_vgpr1
+    %assert_sext:vreg_64(s64) = G_ASSERT_SEXT %copy, 4
+    S_ENDPGM 0, implicit %assert_sext
+...
+ +--- +name: assert_sext_sgpr_64 +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: assert_sext_sgpr_64 + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:sreg_64(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY %copy(s64) + ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:sgpr(s64) = G_ASSERT_SEXT [[COPY]], 4 + ; CHECK-NEXT: %assert_sext:sreg_64(s64) = COPY [[ASSERT_SEXT]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s64) + %copy:sreg_64(s64) = COPY $sgpr0_sgpr1 + %assert_sext:sreg_64(s64) = G_ASSERT_SEXT %copy, 4 + S_ENDPGM 0, implicit %assert_sext +... + +--- +name: assert_sext_agpr_64 +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0_agpr1 + + ; CHECK-LABEL: name: assert_sext_agpr_64 + ; CHECK: liveins: $agpr0_agpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:areg_64(s64) = COPY $agpr0_agpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY %copy(s64) + ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:vgpr(s64) = G_ASSERT_SEXT [[COPY]], 4 + ; CHECK-NEXT: %assert_sext:areg_64(s64) = COPY [[ASSERT_SEXT]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s64) + %copy:areg_64(s64) = COPY $agpr0_agpr1 + %assert_sext:areg_64(s64) = G_ASSERT_SEXT %copy, 4 + S_ENDPGM 0, implicit %assert_sext +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir index 0bce908..c64a8ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s --- name: assert_zext_vgpr @@ -53,8 +53,8 @@ body: | ; CHECK-LABEL: name: assert_zext_agpr ; CHECK: liveins: $agpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %copy:agpr(s32) = COPY $agpr0 - ; CHECK-NEXT: %assert_zext:agpr(s32) = G_ASSERT_ZEXT %copy, 4 + ; CHECK-NEXT: %copy:vgpr(s32) = COPY $agpr0 + ; CHECK-NEXT: %assert_zext:vgpr(s32) = G_ASSERT_ZEXT %copy, 4 ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s32) %copy:_(s32) = COPY $agpr0 %assert_zext:_(s32) = G_ASSERT_ZEXT %copy, 4 @@ -74,7 +74,8 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %copy:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: %assert_zext:vgpr(s32) = G_ASSERT_ZEXT %copy, 4 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY %copy(s32) + ; CHECK-NEXT: %assert_zext:vgpr(s32) = G_ASSERT_ZEXT [[COPY]], 4 ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s32) %copy:vgpr_32(s32) = COPY $vgpr0 %assert_zext:_(s32) = G_ASSERT_ZEXT %copy, 4 @@ -94,9 +95,76 @@ body: | ; CHECK: liveins: $sgpr8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %copy:sgpr_32(s32) = COPY $sgpr8 - ; CHECK-NEXT: %assert_zext:sgpr(s32) = G_ASSERT_ZEXT %copy, 4 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY %copy(s32) + ; CHECK-NEXT: %assert_zext:sgpr(s32) = G_ASSERT_ZEXT [[COPY]], 4 ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s32) %copy:sgpr_32(s32) = COPY $sgpr8 %assert_zext:_(s32) = G_ASSERT_ZEXT %copy, 4 S_ENDPGM 0, implicit %assert_zext ... 
+ +--- +name: assert_zext_vgpr_64 +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: assert_zext_vgpr_64 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:vreg_64(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY %copy(s64) + ; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:vgpr(s64) = G_ASSERT_ZEXT [[COPY]], 4 + ; CHECK-NEXT: %assert_zext:vreg_64(s64) = COPY [[ASSERT_ZEXT]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s64) + %copy:vreg_64(s64) = COPY $vgpr0_vgpr1 + %assert_zext:vreg_64(s64) = G_ASSERT_ZEXT %copy, 4 + S_ENDPGM 0, implicit %assert_zext +... + +--- +name: assert_zext_sgpr_64 +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: assert_zext_sgpr_64 + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:sreg_64(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY %copy(s64) + ; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:sgpr(s64) = G_ASSERT_ZEXT [[COPY]], 4 + ; CHECK-NEXT: %assert_zext:sreg_64(s64) = COPY [[ASSERT_ZEXT]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s64) + %copy:sreg_64(s64) = COPY $sgpr0_sgpr1 + %assert_zext:sreg_64(s64) = G_ASSERT_ZEXT %copy, 4 + S_ENDPGM 0, implicit %assert_zext +... + +--- +name: assert_zext_agpr_64 +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0_agpr1 + + ; CHECK-LABEL: name: assert_zext_agpr_64 + ; CHECK: liveins: $agpr0_agpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:areg_64(s64) = COPY $agpr0_agpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY %copy(s64) + ; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:vgpr(s64) = G_ASSERT_ZEXT [[COPY]], 4 + ; CHECK-NEXT: %assert_zext:areg_64(s64) = COPY [[ASSERT_ZEXT]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s64) + %copy:areg_64(s64) = COPY $agpr0_agpr1 + %assert_zext:areg_64(s64) = G_ASSERT_ZEXT %copy, 4 + S_ENDPGM 0, implicit %assert_zext +... 
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 4b14dc6..7ee0015f 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -21204,18 +21204,14 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fabs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fabs_bf16: @@ -21440,10 +21436,7 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fneg_fabs_bf16: @@ -21451,10 +21444,7 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_or_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fneg_fabs_bf16: @@ -21510,23 +21500,17 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { ; GCN-LABEL: s_fneg_fabs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GCN-NEXT: s_bitset0_b32 s0, 31 -; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_fneg_fabs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX7-NEXT: s_bitset0_b32 s0, 31 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fneg_fabs_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll index 5d184b1..c46fcde 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll @@ -218,19 +218,11 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s4, s3, 0xffff0000 -; CI-NEXT: s_lshl_b32 s3, s3, 16 -; CI-NEXT: s_and_b32 s5, s2, 0xffff0000 -; 
CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4| -; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s3| -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s5| -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2| -; CI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -537,16 +529,15 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e64 v4, 1.0, |v3| -; CI-NEXT: v_mul_f32_e64 v5, 1.0, |v2| -; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; CI-NEXT: v_mul_f32_e32 v3, v4, v3 -; CI-NEXT: v_mul_f32_e32 v2, v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_and_b32_e32 v3, 0x7fff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mul_f32_e32 v2, v2, v5 +; CI-NEXT: v_mul_f32_e32 v3, v3, v4 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -898,16 +889,13 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2bf16(ptr addrspace(1) %in) #0 { ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_mul_f32_e64 v1, 1.0, |v1| -; CI-NEXT: v_mul_f32_e64 v0, 1.0, |v0| -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_add_f32_e32 v0, 2.0, v0 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; CI-NEXT: flat_store_short v[0:1], v1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_short v[0:1], v0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll index 64a9727..76da0aa 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -107,12 +107,10 @@ define amdgpu_kernel void @fneg_fabs_fmul_bf16(ptr addrspace(1) %out, bfloat %x, ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s2, 0x7fff -; CI-NEXT: s_lshl_b32 s3, s3, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 +; CI-NEXT: s_lshl_b32 s3, s2, 16 ; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_mul_f32_e32 v0, s2, v0 +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_mul_f32_e64 v0, s2, -|v0| ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -204,12 +202,10 @@ define 
amdgpu_kernel void @fneg_fabs_free_bf16(ptr addrspace(1) %out, i16 %in) { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s2, s2, 0x7fff -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -279,12 +275,10 @@ define amdgpu_kernel void @fneg_fabs_bf16(ptr addrspace(1) %out, bfloat %in) { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s2, s2, 0x7fff -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -345,43 +339,22 @@ define amdgpu_kernel void @fneg_fabs_bf16(ptr addrspace(1) %out, bfloat %in) { } define amdgpu_kernel void @v_fneg_fabs_bf16(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; CI-LABEL: v_fneg_fabs_bf16: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_ushort v2, v[0:1] -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |v2| -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; CI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: flat_store_short v[0:1], v2 -; CI-NEXT: s_endpgm -; -; VI-LABEL: v_fneg_fabs_bf16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_ushort v2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x8000, v2 -; VI-NEXT: flat_store_short v[0:1], v2 -; VI-NEXT: s_endpgm +; CIVI-LABEL: v_fneg_fabs_bf16: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: flat_load_ushort v2, v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: v_or_b32_e32 v2, 0x8000, v2 +; CIVI-NEXT: flat_store_short v[0:1], v2 +; CIVI-NEXT: s_endpgm ; ; GFX9-LABEL: v_fneg_fabs_bf16: ; GFX9: ; %bb.0: @@ -431,21 +404,13 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_add_f32_e64 v0, s3, 
2.0 -; CI-NEXT: v_add_f32_e64 v1, s2, 1.0 -; CI-NEXT: v_readfirstlane_b32 s2, v0 +; CI-NEXT: s_lshl_b32 s3, s2, 16 ; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: s_bitset0_b32 s2, 31 -; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1 -; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: s_xor_b32 s2, s2, 0x80000000 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: s_lshr_b32 s2, s2, 16 -; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; CI-NEXT: v_alignbit_b32 v2, s2, v0, 16 +; CI-NEXT: v_add_f32_e64 v1, s2, 2.0 +; CI-NEXT: v_add_f32_e64 v0, s3, 1.0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -566,15 +531,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s2, 0x7fff -; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: s_lshl_b32 s2, s3, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -629,27 +589,11 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s4, s2, 16 -; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2| -; CI-NEXT: s_and_b32 s2, s3, 0xffff0000 -; CI-NEXT: s_lshl_b32 s5, s3, 16 -; CI-NEXT: v_mul_f32_e64 v3, 1.0, |s2| -; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4| -; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s5| -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; CI-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; CI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; CI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; CI-NEXT: s_or_b32 s3, s3, 0x80008000 +; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -860,21 +804,20 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2bf16(ptr addrspace(1) %out0, ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: s_and_b32 s1, s4, 0x7fff -; CI-NEXT: s_and_b32 s2, s4, 0x7fff0000 -; CI-NEXT: v_mul_f32_e64 v4, -1.0, s2 -; CI-NEXT: s_lshl_b32 s1, s1, 16 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_mul_f32_e64 v5, -1.0, s1 -; CI-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v2, 
s2 +; CI-NEXT: s_or_b32 s2, s0, 0x8000 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_and_b32 s1, s4, 0x7fff0000 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s1, s1, s2 +; CI-NEXT: s_bitset1_b32 s1, 31 +; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_store_dword v[0:1], v5 -; CI-NEXT: flat_store_dword v[2:3], v4 +; CI-NEXT: flat_store_dword v[0:1], v4 +; CI-NEXT: v_mov_b32_e32 v0, s1 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_multi_use_fabs_v2bf16: @@ -1086,5 +1029,3 @@ declare <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat>) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CIVI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll index d232693..98044a7 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll @@ -14,11 +14,10 @@ define amdgpu_kernel void @s_fneg_bf16(ptr addrspace(1) %out, bfloat %in) #0 { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -93,9 +92,7 @@ define amdgpu_kernel void @v_fneg_bf16(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_ushort v2, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -170,11 +167,10 @@ define amdgpu_kernel void @s_fneg_free_bf16(ptr addrspace(1) %out, i16 %in) #0 { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -248,9 +244,9 @@ define amdgpu_kernel void @v_fneg_fold_bf16(ptr addrspace(1) %out, ptr addrspace ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v3, -1.0, v2 -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_mul_f32_e32 v2, v3, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; CI-NEXT: flat_store_short v[0:1], v2 @@ -365,13 +361,13 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 -; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s2, s2, s3 +; CI-NEXT: 
s_add_i32 s2, s2, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -426,16 +422,16 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 { ; CI-NEXT: ; def s2 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s2, s2, s3 +; CI-NEXT: s_add_i32 s2, s2, 0x80000000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -501,13 +497,11 @@ define amdgpu_kernel void @v_fneg_v2bf16(ptr addrspace(1) %out, ptr addrspace(1) ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v2, v[0:1] +; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v3, -1.0, v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2 +; CI-NEXT: v_bfi_b32 v2, s0, v3, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x80000000, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -570,13 +564,13 @@ define amdgpu_kernel void @fneg_free_v2bf16(ptr addrspace(1) %out, i32 %in) #0 { ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 -; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s2, s2, s3 +; CI-NEXT: s_add_i32 s2, s2, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -637,16 +631,14 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v4, -1.0, v3 -; CI-NEXT: v_mul_f32_e32 v5, -1.0, v2 -; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; CI-NEXT: v_mul_f32_e32 v3, v4, v3 -; CI-NEXT: v_mul_f32_e32 v2, v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mul_f32_e64 v2, -v2, v2 +; CI-NEXT: v_mul_f32_e32 v3, v3, v4 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -912,12 +904,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) # ; CI-NEXT: 
v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e32 v1, -1.0, v1 -; CI-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; CI-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; CI-NEXT: flat_store_short v[0:1], v0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_short v[0:1], v1 diff --git a/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll new file mode 100644 index 0000000..49204f8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll @@ -0,0 +1,296 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s + +define amdgpu_kernel void @fptoui_f32_to_i16_to_f32(ptr addrspace(1) %out, float %x) { +; GFX6-LABEL: fptoui_f32_to_i16_to_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_trunc_f32_e64 v0, |s6| +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f32_to_i16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e64 v1, |s2| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui float %x to i16 + %fp = uitofp i16 %ui to float + store float %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f32_to_i32_to_f32(ptr addrspace(1) %out, float %x) { +; GFX6-LABEL: fptoui_f32_to_i32_to_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_trunc_f32_e64 v0, |s6| +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f32_to_i32_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e64 v1, |s2| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui float %x to i32 + %fp = uitofp i32 %ui to float + store float %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f32_to_i64_to_f32(ptr addrspace(1) %out, float %x) { +; GFX6-LABEL: fptoui_f32_to_i64_to_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_trunc_f32_e64 v0, |s6| +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f32_to_i64_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; 
GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e64 v1, |s2| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui float %x to i64 + %fp = uitofp i64 %ui to float + store float %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f16_to_i16_to_f16(ptr addrspace(1) %out, half %x) { +; GFX6-LABEL: fptoui_f16_to_i16_to_f16: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f16_to_i16_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e64 v1, |s2| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui half %x to i16 + %fp = uitofp i16 %ui to half + store half %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f16_to_i32_to_f16(ptr addrspace(1) %out, half %x) { +; GFX6-LABEL: fptoui_f16_to_i32_to_f16: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0| +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: v_trunc_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f16_to_i32_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e64 v1, |s2| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui half %x to i32 + %fp = uitofp i32 %ui to half + store half %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f16_to_i64_to_f16(ptr addrspace(1) %out, half %x) { +; GFX6-LABEL: fptoui_f16_to_i64_to_f16: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0| +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: v_trunc_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f16_to_i64_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e64 v1, |s2| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui half %x to i64 + %fp = uitofp i64 %ui to half + store half %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f64_to_i16_to_f64(ptr addrspace(1) %out, 
double %x) { +; GFX6-LABEL: fptoui_f64_to_i16_to_f64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3] +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f64_to_i16_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui double %x to i16 + %fp = uitofp i16 %ui to double + store double %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f64_to_i32_to_f64(ptr addrspace(1) %out, double %x) { +; GFX6-LABEL: fptoui_f64_to_i32_to_f64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3] +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f64_to_i32_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui double %x to i32 + %fp = uitofp i32 %ui to double + store double %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f64_to_i64_to_f64(ptr addrspace(1) %out, double %x) { +; GFX6-LABEL: fptoui_f64_to_i64_to_f64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s5, 0xfffff +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_not_b32_e32 v0, 31 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s7, s3, 0xb0014 +; GFX6-NEXT: s_addk_i32 s7, 0xfc01 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 +; GFX6-NEXT: s_and_b32 s8, s3, 0x80000000 +; GFX6-NEXT: s_andn2_b64 s[4:5], s[2:3], s[4:5] +; GFX6-NEXT: s_cmp_lt_i32 s7, 0 +; GFX6-NEXT: s_cselect_b32 s4, 0, s4 +; GFX6-NEXT: s_cselect_b32 s5, s8, s5 +; GFX6-NEXT: s_cmp_gt_i32 s7, 51 +; GFX6-NEXT: s_cselect_b32 s3, s3, s5 +; GFX6-NEXT: s_cselect_b32 s2, s2, s4 +; GFX6-NEXT: v_ldexp_f64 v[0:1], s[2:3], v0 +; GFX6-NEXT: v_mov_b32_e32 v4, -1 +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff +; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX6-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 3 +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: s_mov_b32 s5, 0xc1f00000 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_fma_f64 v[2:3], v[0:1], s[4:5], v[2:3] +; GFX6-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] +; GFX6-NEXT: v_cvt_u32_f64_e32 v2, v[2:3] +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[2:3], v2 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX6-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f64_to_i64_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui double %x to i64 + %fp = uitofp i64 %ui to double + store double %fp, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 67d0410..3324018 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -3,11 +3,11 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; SI-LABEL: is_private_vgpr: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index 63333ed..355d002 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -3,11 +3,11 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s -; RUN: llc 
-global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; CIT-LABEL: is_local_vgpr: diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll new file mode 100644 index 0000000..f53aaaa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -0,0 +1,625 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s + +declare i32 @llvm.ctpop.i32(i32) +declare i64 @llvm.ctpop.i64(i64) +declare i32 @llvm.amdgcn.s.quadmask.i32(i32) +declare i64 @llvm.amdgcn.s.quadmask.i64(i64) + +define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: shl32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = shl i32 %val0, %val1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: shl64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = shl i64 %val0, %val1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: lshr32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshr_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = lshr i32 %val0, %val1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: lshr64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = lshr i64 %val0, %val1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @ashr32(i32 inreg 
%val0, i32 inreg %val1) { +; CHECK-LABEL: ashr32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_ashr_i32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = ashr i32 %val0, %val1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: ashr64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = ashr i64 %val0, %val1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @abs32(i32 inreg %val0) { +; CHECK-LABEL: abs32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_abs_i32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %neg = sub i32 0, %val0 + %cond = icmp sgt i32 %val0, %neg + %result = select i1 %cond, i32 %val0, i32 %neg + call void asm "; use $0", "s"(i32 %result) + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: and32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = and i32 %val0, %val1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: and64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = and i64 %val0, %val1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: or32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_or_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = or i32 %val0, %val1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: or64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = or i64 %val0, %val1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg 
%val1) { +; CHECK-LABEL: xor32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_xor_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = xor i32 %val0, %val1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: xor64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = xor i64 %val0, %val1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: nand32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_nand_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = and i32 %val0, %val1 + %result2 = xor i32 %result, -1 + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: nand64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = and i64 %val0, %val1 + %result2 = xor i64 %result, -1 + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: nor32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_nor_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = or i32 %val0, %val1 + %result2 = xor i32 %result, -1 + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: nor64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = or i64 %val0, %val1 + %result2 = xor i64 %result, -1 + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: 
xnor32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_xnor_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = xor i32 %val0, %val1 + %result2 = xor i32 %result, -1 + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: xnor64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = xor i64 %val0, %val1 + %result2 = xor i64 %result, -1 + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: andn232: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_andn2_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %nval1 = xor i32 %val1, -1 + %result = and i32 %val0, %nval1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: nandn264: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %nval1 = xor i64 %val1, -1 + %result = and i64 %val0, %nval1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: orn232: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_orn2_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %nval1 = xor i32 %val1, -1 + %result = or i32 %val0, %nval1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: orn264: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %nval1 = xor i64 %val1, -1 + %result = or i64 %val0, %nval1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) { +; CHECK-LABEL: bfe_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: 
v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %shl = shl i32 %val0, 8 + %result = ashr i32 %shl, 24 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bfe_i64(i64 inreg %val0) { +; CHECK-LABEL: bfe_i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x80000 +; CHECK-NEXT: s_and_b32 s0, s0, 0xff +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; return to shader part epilog + %shl = shl i64 %val0, 56 + %result = ashr i64 %shl, 56 + call void asm "; use $0", "s"(i64 %result) + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) { +; CHECK-LABEL: bfe_u32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %shl = shl i32 %val0, 8 + %result = lshr i32 %shl, 24 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) { +; CHECK-LABEL: bfe_u64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b32 s0, s0, 0xff +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %shl = shl i64 %val0, 56 + %result = lshr i64 %shl, 56 + call void asm "; use $0", "s"(i64 %result) + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bcnt032(i32 inreg %val0) { +; CHECK-LABEL: bcnt032: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 +; CHECK-NEXT: s_sub_i32 s0, 32, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %result2 = sub i32 32, %result + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bcnt064(i64 inreg %val0) { +; CHECK-LABEL: bcnt064: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_sub_u32 s0, 64, s0 +; CHECK-NEXT: s_subb_u32 s1, 0, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + %result2 = sub i64 64, %result + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bcnt132(i32 inreg %val0) { +; CHECK-LABEL: bcnt132: +; CHECK: ; 
%bb.0: +; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + call void asm "; use $0", "s"(i32 %result) + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bcnt164(i64 inreg %val0) { +; CHECK-LABEL: bcnt164: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + call void asm "; use $0", "s"(i64 %result) + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @quadmask32(i32 inreg %val0) { +; CHECK-LABEL: quadmask32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_quadmask_b32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %val0) nounwind readnone + call void asm "; use $0", "s"(i32 %result) + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @quadmask64(i64 inreg %val0) { +; CHECK-LABEL: quadmask64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %val0) nounwind readnone + call void asm "; use $0", "s"(i64 %result) + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @not32(i32 inreg %val0) { +; CHECK-LABEL: not32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_not_b32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = xor i32 %val0, -1 + call void asm "; use $0", "s"(i32 %result) + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @not64(i64 inreg %val0) { +; CHECK-LABEL: not64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_not_b64 s[0:1], s[0:1] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = xor i64 %val0, -1 + call void asm "; use $0", "s"(i64 %result) + 
%cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index 8b467eb..75ae76f 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -1,7 +1,7 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 # RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX950 %s # RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX942 %s -# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX90a %s +# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX90A %s --- name: test_pk_mul_unpacking_f32 @@ -57,26 +57,26 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_pk_mul_unpacking_f32 - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed 
$vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_pk_mul_unpacking_f32 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -150,26 +150,26 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_op_sel_selection_unpacking_f32 - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 
0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_op_sel_selection_unpacking_f32 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed 
$sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -243,26 +243,26 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_op_sel_hi_selection_unpacking_f32 - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_op_sel_hi_selection_unpacking_f32 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -370,41 +370,41 @@ body: | ; GFX942-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_pk_add_unpacking_f32 - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec - ; GFX90a-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; GFX90a-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec - ; GFX90a-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec - ; GFX90a-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec - ; GFX90a-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec - ; GFX90a-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, killed $sgpr2_sgpr3, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: renamable $vgpr6_vgpr7 = nofpexcept V_PK_ADD_F32 8, 
killed $sgpr6_sgpr7, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_ADD_F32 8, killed $sgpr4_sgpr5, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_pk_add_unpacking_f32 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec + ; GFX90A-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, killed $sgpr2_sgpr3, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = nofpexcept V_PK_ADD_F32 8, killed $sgpr6_sgpr7, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_ADD_F32 8, killed $sgpr4_sgpr5, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec @@ -438,7 +438,6 @@ body: | renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec S_ENDPGM 0 - ... 
--- name: test_pk_fma_unpacking_f32 @@ -490,24 +489,24 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_pk_fma_unpacking_f32 - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_pk_fma_unpacking_f32 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, 
implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -577,25 +576,25 @@ body: | ; GFX942-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_unpacking_does_not_introduce_rw_dependency - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = 
V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_unpacking_does_not_introduce_rw_dependency + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -670,28 +669,28 @@ body: | ; GFX942-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable 
$sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, 
implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -705,14 +704,11 @@ body: | early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec - $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec - S_ENDPGM 0 ... 
@@ -770,26 +766,26 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_opsel_register_is_correctly_marked_as_killed - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_opsel_register_is_correctly_marked_as_killed + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable 
$sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -861,26 +857,26 @@ body: | ; GFX942-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit 
$sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec - ; GFX90a-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec - ; GFX90a-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable 
$sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -950,25 +946,25 @@ body: | ; GFX942-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_mfma_def_using_instr_blocks_unpacking - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_mfma_def_using_instr_blocks_unpacking + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable 
$sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -1041,26 +1037,26 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_unpacking_with_imm_input - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit 
$sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_unpacking_with_imm_input + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -1134,26 +1130,26 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 1, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_neg_lo_hi_post_unpacking - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable 
$sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_neg_lo_hi_post_unpacking + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, 
implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 diff --git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll new file mode 100644 index 0000000..a2d6ca9 --- /dev/null +++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll @@ -0,0 +1,39 @@ +;; Test that a potential indirect call target function which has internal linkage and +;; whose address is taken has its type ID emitted to the callgraph section. +;; This test also makes sure that callback functions which meet the above constraint +;; are handled correctly. + +; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -o - < %s | FileCheck %s + +declare !type !0 void @_Z6doWorkPFviE(ptr) + +define i32 @_Z4testv() !type !1 { +entry: + call void @_Z6doWorkPFviE(ptr nonnull @_ZL10myCallbacki) + ret i32 0 +} + +; CHECK: _ZL10myCallbacki: +; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: +define internal void @_ZL10myCallbacki(i32 %value) !type !2 { +entry: + %sink = alloca i32, align 4 + store volatile i32 %value, ptr %sink, align 4 + %i1 = load volatile i32, ptr %sink, align 4 + ret void +} + +!0 = !{i64 0, !"_ZTSFvPFviEE.generalized"} +!1 = !{i64 0, !"_ZTSFivE.generalized"} +!2 = !{i64 0, !"_ZTSFviE.generalized"} + +; CHECK: .section .callgraph,"o",%progbits,.text +;; Version +; CHECK-NEXT: .byte 0 +;; Flags -- Potential indirect target, so the LSB is set to 1. Other bits are 0. +; CHECK-NEXT: .byte 1 +;; Function Entry PC +; CHECK-NEXT: .long [[LABEL_FUNC]] +;; Function type ID -5212364466660467813 +; CHECK-NEXT: .long 1154849691 +; CHECK-NEXT: .long 3081369122 diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll new file mode 100644 index 0000000..bf5249e --- /dev/null +++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll @@ -0,0 +1,63 @@ +;; Test that temporary labels are generated for each indirect callsite. +;; Test that the MD5 hash of the callee's type (type ID) in the .callgraph section +;; is correctly paired with the corresponding temporary label generated for indirect +;; call sites annotated with !callee_type metadata. +;; Test that the .callgraph section contains unique direct callees.
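The CHECK lines in the test above walk one version-0 record field by field: a version byte, a flags byte, the entry PC, and the 64-bit type ID emitted as two `.long`s; the assembly and tail-call tests that follow additionally exercise the flag-gated callee arrays. As a reading aid, here is a minimal decoding sketch of that layout. It assumes little-endian byte order and 4-byte pointers (matching the ARM triples used in these tests), and the bit-2 indirect-type-ID section is inferred from these tests; the helper and field names are ours, not LLVM's.

```python
import struct

def decode_record(buf, off, ptr_size=4):
    """Decode one version-0 .callgraph record from little-endian bytes."""
    def uleb128(o):
        value, shift = 0, 0
        while True:
            byte = buf[o]; o += 1
            value |= (byte & 0x7F) << shift
            shift += 7
            if not byte & 0x80:
                return value, o

    ptr_fmt = "<I" if ptr_size == 4 else "<Q"
    version, flags = buf[off], buf[off + 1]
    off += 2
    entry_pc = struct.unpack_from(ptr_fmt, buf, off)[0]
    off += ptr_size
    type_id = struct.unpack_from("<Q", buf, off)[0]  # always present
    off += 8
    record = {"version": version,
              "indirect_target": bool(flags & 1),  # flags bit 0
              "entry_pc": entry_pc, "type_id": type_id,
              "direct_callees": [], "indirect_type_ids": []}
    if flags & 2:  # bit 1: unique direct callee addresses follow
        count, off = uleb128(off)
        for _ in range(count):
            record["direct_callees"].append(
                struct.unpack_from(ptr_fmt, buf, off)[0])
            off += ptr_size
    if flags & 4:  # bit 2: unique indirect target type IDs follow
        count, off = uleb128(off)
        for _ in range(count):
            record["indirect_type_ids"].append(
                struct.unpack_from("<Q", buf, off)[0])
            off += 8
    return record, off
```

On the addrtaken record above, flags byte 1 with no callee arrays decodes to an address-taken function that records neither direct nor indirect callees; the assembly test's flags byte 7 sets all three bits, so both arrays follow.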
+ + ; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -o - < %s | FileCheck %s + +declare !type !0 void @direct_foo() +declare !type !1 i32 @direct_bar(i8) +declare !type !2 ptr @direct_baz(ptr) + +; CHECK: ball: +; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: +define ptr @ball() { +entry: + call void @direct_foo() + %fp_foo_val = load ptr, ptr null, align 8 + call void (...) %fp_foo_val(), !callee_type !0 + call void @direct_foo() + %fp_bar_val = load ptr, ptr null, align 8 + %call_fp_bar = call i32 %fp_bar_val(i8 0), !callee_type !2 + %call_fp_bar_direct = call i32 @direct_bar(i8 1) + %fp_baz_val = load ptr, ptr null, align 8 + %call_fp_baz = call ptr %fp_baz_val(ptr null), !callee_type !4 + call void @direct_foo() + %call_fp_baz_direct = call ptr @direct_baz(ptr null) + call void @direct_foo() + ret ptr %call_fp_baz +} + +!0 = !{!1} +!1 = !{i64 0, !"_ZTSFvE.generalized"} +!2 = !{!3} +!3 = !{i64 0, !"_ZTSFicE.generalized"} +!4 = !{!5} +!5 = !{i64 0, !"_ZTSFPvS_E.generalized"} + +; CHECK: .section .callgraph,"o",%progbits,.text +;; Version +; CHECK-NEXT: .byte 0 +;; Flags +; CHECK-NEXT: .byte 7 +;; Function Entry PC +; CHECK-NEXT: .long [[LABEL_FUNC]] +;; Function type ID -- set to 0 since no type metadata is attached to the function. +; CHECK-NEXT: .long 0 +; CHECK-NEXT: .long 0 +;; Number of unique direct callees. +; CHECK-NEXT: .byte 3 +;; Direct callees. +; CHECK-NEXT: .long direct_foo +; CHECK-NEXT: .long direct_bar +; CHECK-NEXT: .long direct_baz +;; Number of unique indirect target type IDs. +; CHECK-NEXT: .byte 3 +;; Indirect type IDs. +; CHECK-NEXT: .long 838288420 +; CHECK-NEXT: .long 1053552373 +; CHECK-NEXT: .long 1505527380 +; CHECK-NEXT: .long 814631809 +; CHECK-NEXT: .long 342417018 +; CHECK-NEXT: .long 2013108216 diff --git a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll new file mode 100644 index 0000000..d577603 --- /dev/null +++ b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll @@ -0,0 +1,34 @@ +;; Tests that we store the type identifiers in the .callgraph section of the object file for tail calls. + +; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -filetype=obj -o - < %s | \ +; RUN: llvm-readelf -x .callgraph - | FileCheck %s + +define i32 @check_tailcall(ptr %func, i8 %x) !type !0 { +entry: + %call = tail call i32 %func(i8 signext %x), !callee_type !1 + ret i32 %call +} + +define i32 @main(i32 %argc) !type !3 { +entry: + %andop = and i32 %argc, 1 + %cmp = icmp eq i32 %andop, 0 + %foo.bar = select i1 %cmp, ptr @foo, ptr @bar + %call.i = tail call i32 %foo.bar(i8 signext 97), !callee_type !1 + ret i32 %call.i +} + +declare !type !2 i32 @foo(i8 signext) + +declare !type !2 i32 @bar(i8 signext) + +!0 = !{i64 0, !"_ZTSFiPvcE.generalized"} +!1 = !{!2} +!2 = !{i64 0, !"_ZTSFicE.generalized"} +!3 = !{i64 0, !"_ZTSFiiE.generalized"} + +; CHECK: Hex dump of section '.callgraph': +; CHECK-NEXT: 0x00000000 00050000 00008e19 0b7f3326 e3000154 +; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05100000 00a150b8 +;; Verify that the type ID 0x308e4b8159bc8654 is in the section. +; CHECK-NEXT: 0x00000020 3e0cfe3c b2015486 bc59814b 8e30 diff --git a/llvm/test/CodeGen/ARM/call-graph-section.ll b/llvm/test/CodeGen/ARM/call-graph-section.ll new file mode 100644 index 0000000..928a1067 --- /dev/null +++ b/llvm/test/CodeGen/ARM/call-graph-section.ll @@ -0,0 +1,37 @@ +;; Tests that we store the type identifiers in the .callgraph section of the object file.
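The expected constants in these tests (for example the `0x308e4b8159bc8654` called out in the tail-call test) are the MD5-based type IDs the comments mention. A sketch of reproducing one, under the assumption that the ID is the low 64 bits of the MD5 digest of the generalized type name, read little-endian (the convention `llvm::MD5Hash` uses); if the actual convention differs, the printed value will not match the expected constant:

```python
import hashlib
import struct

def type_id(type_name: str) -> int:
    digest = hashlib.md5(type_name.encode()).digest()
    # Assumed convention: first 8 digest bytes as a little-endian u64.
    return struct.unpack("<Q", digest[:8])[0]

# The tail-call test expects this ID for the i32(i8) signature:
print(hex(type_id("_ZTSFicE.generalized")))  # expected 0x308e4b8159bc8654
```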
+ + ; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -filetype=obj -o - < %s | \ +; RUN: llvm-readelf -x .callgraph - | FileCheck %s + +declare !type !0 void @foo() + +declare !type !1 i32 @bar(i8) + +declare !type !2 ptr @baz(ptr) + +define void @main() { +entry: + %fp_foo_val = load ptr, ptr null, align 8 + call void (...) %fp_foo_val(), !callee_type !1 + %fp_bar_val = load ptr, ptr null, align 8 + %call_fp_bar = call i32 %fp_bar_val(i8 0), !callee_type !3 + %fp_baz_val = load ptr, ptr null, align 8 + %call_fp_baz = call ptr %fp_baz_val(ptr null), !callee_type !4 + ret void +} + +;; Check that the numeric type IDs (MD5 hashes) for the below type IDs are emitted +;; to the callgraph section. +!0 = !{i64 0, !"_ZTSFvE.generalized"} +!1 = !{!0} +!2 = !{i64 0, !"_ZTSFicE.generalized"} +!3 = !{!2} +!4 = !{!5} +!5 = !{i64 0, !"_ZTSFPvS_E.generalized"} + +;; Make sure the following type IDs are in the call graph section: +;; 0x5eecb3e2444f731f, 0x814b8e305486bc59, 0xf897fd777ade6814 +; CHECK: Hex dump of section '.callgraph': +; CHECK-NEXT: 0x00000000 00050000 00000000 00000000 00000324 +; CHECK-NEXT: 0x00000010 44f731f5 eecb3e54 86bc5981 4b8e307a +; CHECK-NEXT: 0x00000020 de6814f8 97fd77 diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll index 800ee87..8230e47 100644 --- a/llvm/test/CodeGen/ARM/fp16-promote.ll +++ b/llvm/test/CodeGen/ARM/fp16-promote.ll @@ -1572,26 +1572,11 @@ define void @test_fabs(ptr %p) { -; CHECK-FP16-LABEL: test_fabs: -; CHECK-FP16: ldrh r1, [r0] -; CHECK-FP16-NEXT: vmov s0, r1 -; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FP16-NEXT: vabs.f32 s0, s0 -; CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FP16-NEXT: vmov r1, s0 -; CHECK-FP16-NEXT: strh r1, [r0] -; CHECK-FP16-NEXT: bx lr -; -; CHECK-LIBCALL-LABEL: test_fabs: -; CHECK-LIBCALL: .save {r4, lr} -; CHECK-LIBCALL-NEXT: push {r4, lr} -; CHECK-LIBCALL-NEXT: mov r4, r0 -; CHECK-LIBCALL-NEXT: ldrh r0, [r0] -; CHECK-LIBCALL-NEXT: bl __aeabi_h2f -; CHECK-LIBCALL-NEXT: bic r0, r0, #-2147483648 -; CHECK-LIBCALL-NEXT: bl __aeabi_f2h -; CHECK-LIBCALL-NEXT: strh r0, [r4] -; CHECK-LIBCALL-NEXT: pop {r4, pc} +; CHECK-ALL-LABEL: test_fabs: +; CHECK-ALL: ldrh r1, [r0] +; CHECK-ALL-NEXT: bfc r1, #15, #17 +; CHECK-ALL-NEXT: strh r1, [r0] +; CHECK-ALL-NEXT: bx lr %a = load half, ptr %p, align 2 %r = call half @llvm.fabs.f16(half %a) store half %r, ptr %p @@ -2454,26 +2439,11 @@ define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 { } define void @test_fneg(ptr %p1, ptr %p2) #0 { -; CHECK-FP16-LABEL: test_fneg: -; CHECK-FP16: ldrh r0, [r0] -; CHECK-FP16-NEXT: vmov s0, r0 -; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FP16-NEXT: vneg.f32 s0, s0 -; CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FP16-NEXT: vmov r0, s0 -; CHECK-FP16-NEXT: strh r0, [r1] -; CHECK-FP16-NEXT: bx lr -; -; CHECK-LIBCALL-LABEL: test_fneg: -; CHECK-LIBCALL: .save {r4, lr} -; CHECK-LIBCALL-NEXT: push {r4, lr} -; CHECK-LIBCALL-NEXT: ldrh r0, [r0] -; CHECK-LIBCALL-NEXT: mov r4, r1 -; CHECK-LIBCALL-NEXT: bl __aeabi_h2f -; CHECK-LIBCALL-NEXT: eor r0, r0, #-2147483648 -; CHECK-LIBCALL-NEXT: bl __aeabi_f2h -; CHECK-LIBCALL-NEXT: strh r0, [r4] -; CHECK-LIBCALL-NEXT: pop {r4, pc} +; CHECK-ALL-LABEL: test_fneg: +; CHECK-ALL: ldrh r0, [r0] +; CHECK-ALL-NEXT: eor r0, r0, #32768 +; CHECK-ALL-NEXT: strh r0, [r1] +; CHECK-ALL-NEXT: bx lr %v = load half, ptr %p1, align 2 %res = fneg half %v store half %res, ptr %p2, align 2 diff --git a/llvm/test/CodeGen/Generic/bfloat-op.ll
b/llvm/test/CodeGen/Generic/bfloat-op.ll new file mode 100644 index 0000000..d593328 --- /dev/null +++ b/llvm/test/CodeGen/Generic/bfloat-op.ll @@ -0,0 +1,104 @@ +; Same as `bfloat.ll`, but for `fneg`, `fabs`, `copysign` and `fma`. +; Can be merged back into `bfloat.ll` once they have the same platform coverage. +; Once all targets are fixed, the `CHECK-*` prefixes should all be merged into a single `CHECK` prefix and the `BAD-*` prefixes should be removed. + +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-apple-darwin | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=arm64ec-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: BPF has a compiler error +; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: hard float csky crashes +; FIXME: directx has a compiler error +; FIXME: hexagon crashes +; RUN: %if lanai-registered-target %{ llc %s -o - -mtriple=lanai-unknown-unknown | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if m68k-registered-target %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: mips crashes +; RUN: %if msp430-registered-target %{ llc %s -o - -mtriple=msp430-none-elf | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if nvptx-registered-target %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda | FileCheck %s --check-prefixes=NOCRASH %} +; FIXME: powerpc crashes +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: sparc crashes +; FIXME: spirv crashes +; FIXME: s390x crashes +; FIXME: ve crashes +; FIXME: wasm crashes +; RUN: %if x86-registered-target 
%{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if xcore-registered-target %{ llc %s -o - -mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if xtensa-registered-target %{ llc %s -o - -mtriple=xtensa-none-elf | FileCheck %s --check-prefixes=ALL,BAD-COPYSIGN,CHECK-FMA %} + +; Note that arm64ec labels are quoted, hence the `{{"?}}:`. + +; Codegen tests don't work the same for graphics targets. Add a dummy directive +; for FileCheck; just make sure we don't crash. +; NOCRASH: {{.*}} + +; fneg, fabs and copysign all need to not quieten signalling NaNs, so they should not call any conversion functions which do. +; These tests won't catch cases where everything is done using native instructions instead of builtins. + +define void @test_fneg(ptr %p1, ptr %p2) #0 { +; ALL-LABEL: test_fneg{{"?}}: +; ALL-NEG-NOT: __extend +; ALL-NEG-NOT: __trunc +; ALL-NEG-NOT: __gnu +; ALL-NEG-NOT: __aeabi + %v = load bfloat, ptr %p1 + %res = fneg bfloat %v + store bfloat %res, ptr %p2 + ret void +} + +define void @test_fabs(ptr %p1, ptr %p2) { +; ALL-LABEL: test_fabs{{"?}}: +; ALL-ABS-NOT: __extend +; ALL-ABS-NOT: __trunc +; ALL-ABS-NOT: __gnu +; ALL-ABS-NOT: __aeabi + %a = load bfloat, ptr %p1 + %r = call bfloat @llvm.fabs.bf16(bfloat %a) + store bfloat %r, ptr %p2 + ret void +} + +define void @test_copysign(ptr %p1, ptr %p2, ptr %p3) { +; ALL-LABEL: test_copysign{{"?}}: +; CHECK-COPYSIGN-NOT: __extend +; CHECK-COPYSIGN-NOT: __trunc +; CHECK-COPYSIGN-NOT: __gnu +; CHECK-COPYSIGN-NOT: __aeabi +; BAD-COPYSIGN: __truncsfbf2 + %a = load bfloat, ptr %p1 + %b = load bfloat, ptr %p2 + %r = call bfloat @llvm.copysign.bf16(bfloat %a, bfloat %b) + store bfloat %r, ptr %p3 + ret void +} + +; There is no floating-point type LLVM supports that is large enough to promote bfloat FMA to +; without causing double rounding issues. This checks for libcalls to f32/f64 fma and truncating +; f32/f64 to bf16. See https://github.com/llvm/llvm-project/issues/131531 + +define void @test_fma(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { +; ALL-LABEL: test_fma{{"?}}: +; CHECK-FMA-NOT: {{\bfmaf?\b}} +; CHECK-FMA-NOT: __truncsfbf2 +; CHECK-FMA-NOT: __truncdfbf2 +; BAD-FMA: {{__truncsfbf2|\bfmaf?\b}} + %a = load bfloat, ptr %p1 + %b = load bfloat, ptr %p2 + %c = load bfloat, ptr %p3 + %r = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) + store bfloat %r, ptr %p4 + ret void +} diff --git a/llvm/test/CodeGen/Generic/bfloat.ll b/llvm/test/CodeGen/Generic/bfloat.ll new file mode 100644 index 0000000..83c6711 --- /dev/null +++ b/llvm/test/CodeGen/Generic/bfloat.ll @@ -0,0 +1,75 @@ +; Simple cross-platform smoke checks for basic bf16 operations. +; +; There shouldn't be any architectures that crash when trying to use `bfloat`; +; check that here. Additionally, do a small handful of smoke tests that work +; well cross-platform.
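The sign-bit point made by the bfloat-op tests above (and visible in the fp16-promote.ll change earlier, where fabs becomes a `bfc` and fneg an `eor` of bit 15) can be stated directly: for both IEEE half and bfloat16, these three operations only touch the top bit of the 16-bit pattern, so no format conversion, and therefore no NaN quieting, needs to happen. A small sketch over raw bit patterns:

```python
SIGN = 0x8000  # bit 15: the sign bit of both half and bfloat16 patterns

def fneg_bits(x):
    return x ^ SIGN                       # flip the sign

def fabs_bits(x):
    return x & 0x7FFF                     # clear the sign

def copysign_bits(mag, sgn):
    return (mag & 0x7FFF) | (sgn & SIGN)  # splice magnitude and sign

# A signalling NaN stays signalling: 0x7D00 is a half sNaN (exponent all
# ones, quiet bit clear), and only its sign bit ever changes.
assert fneg_bits(0x7D00) == 0xFD00
assert fabs_bits(0xFD00) == 0x7D00
assert copysign_bits(0x3C00, 0x8000) == 0xBC00  # copysign(1.0, -x) == -1.0
```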
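The double-rounding hazard named before `test_fma` above can also be shown concretely. In the sketch below (NumPy; the operand values are our own construction, not taken from the tests), the exact product-sum lands just above a halfway point for half precision; rounding through float32 first erases that tie-breaking bit, so the final half result differs from the correctly rounded one. For half, promoting to f64 leaves enough bits to avoid this; for bfloat FMA the comments above assert that no supported type is wide enough in general.

```python
import numpy as np

# a, b, c are exactly representable halfs: 565 and 29 fit in half's
# 11-bit significand, and a*b == (2**14 + 1) * 2**-25 == 2**-11 + 2**-25.
a = np.float16(565 * 2.0**-13)
b = np.float16(29 * 2.0**-12)
c = np.float16(1.0)

exact = 565 * 29 * 2.0**-25 + 1.0   # 1 + 2**-11 + 2**-25, exact in f64
correct = np.float16(exact)          # one rounding: 1 + 2**-10

# Promoting only to f32 rounds 1 + 2**-11 + 2**-25 down to 1 + 2**-11,
# which then ties to even (1.0) when rounded to half: two roundings.
via_f32 = np.float16(np.float32(a) * np.float32(b) + np.float32(c))

assert float(correct) == 1.0009765625 and float(via_f32) == 1.0
```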
+ +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-apple-darwin | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; FIXME: arm64ec crashes when passing/returning bfloat +; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if bpf-registered-target %{ llc %s -o - -mtriple=bpfel | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 | FileCheck %s --check-prefixes=ALL,CHECK %} +; FIXME: hard float csky crashes +; RUN: %if directx-registered-target %{ llc %s -o - -mtriple=dxil-pc-shadermodel6.3-library | FileCheck %s --check-prefixes=NOCRASH %} +; FIXME: hexagon crashes +; RUN: %if lanai-registered-target %{ llc %s -o - -mtriple=lanai-unknown-unknown | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if m68k-registered-target %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; FIXME: mips crashes +; RUN: %if msp430-registered-target %{ llc %s -o - -mtriple=msp430-none-elf | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if nvptx-registered-target %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda | FileCheck %s --check-prefixes=NOCRASH %} +; FIXME: powerpc crashes +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; FIXME: sparc crashes +; FIXME: spirv crashes +; FIXME: s390x crashes +; FIXME: ve crashes +; FIXME: wasm crashes +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if xcore-registered-target %{ llc %s -o - -mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if xtensa-registered-target %{ llc %s -o - -mtriple=xtensa-none-elf | FileCheck %s --check-prefixes=ALL,CHECK %} + +; Note that arm64ec labels are quoted, hence the `{{"?}}:`. 
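For the bitcast smoke tests below, it may help to recall why no conversion call is ever needed: bfloat16 is the high half of the IEEE binary32 bit pattern, so a bitcast is pure bit movement, which is exactly what the tests assert by rejecting libcalls such as `__truncsfbf2`. A sketch of the correspondence, with truncation shown as plain chopping (real conversions also round):

```python
import struct

def f32_to_bits(x: float) -> int:
    return struct.unpack("<I", struct.pack("<f", x))[0]

def bits_to_f32(bits: int) -> float:
    return struct.unpack("<f", struct.pack("<I", bits))[0]

def bf16_bits_from_f32(x: float) -> int:
    # bfloat16 keeps the top 16 bits of binary32: sign, the full 8-bit
    # exponent, and 7 mantissa bits.
    return f32_to_bits(x) >> 16

def f32_from_bf16_bits(bits: int) -> float:
    # Extension is exact: place the 16 bits back in the high half.
    return bits_to_f32(bits << 16)

one = bf16_bits_from_f32(1.0)
assert one == 0x3F80               # 1.0f is 0x3F800000
assert f32_from_bf16_bits(one) == 1.0
```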
+ + ; Codegen tests don't work the same for graphics targets. Add a dummy directive +; for FileCheck; just make sure we don't crash. +; NOCRASH: {{.*}} + +; All backends need to be able to bitcast without converting to another format, +; so we assert against libcalls (specifically __truncsfbf2). This won't catch hardware conversions. + +define bfloat @from_bits(i16 %bits) nounwind { +; ALL-LABEL: from_bits{{"?}}: +; ALL-NOT: __extend +; ALL-NOT: __trunc +; ALL-NOT: __gnu + %f = bitcast i16 %bits to bfloat + ret bfloat %f +} + +define i16 @to_bits(bfloat %f) nounwind { +; ALL-LABEL: to_bits{{"?}}: +; CHECK-NOT: __extend +; CHECK-NOT: __trunc +; CHECK-NOT: __gnu +; BAD: __truncsfbf2 + %bits = bitcast bfloat %f to i16 + ret i16 %bits +} + +define bfloat @check_freeze(bfloat %f) nounwind { +; ALL-LABEL: check_freeze{{"?}}: + %t0 = freeze bfloat %f + ret bfloat %t0 +} diff --git a/llvm/test/CodeGen/Generic/half-op.ll b/llvm/test/CodeGen/Generic/half-op.ll new file mode 100644 index 0000000..1037d8e --- /dev/null +++ b/llvm/test/CodeGen/Generic/half-op.ll @@ -0,0 +1,115 @@ +; Same as `half.ll`, but for `fneg`, `fabs`, `copysign` and `fma`. +; Can be merged back into `half.ll` once BPF doesn't have a compiler error. +; Once all targets are fixed, the `CHECK-*` prefixes should all be merged into a single `CHECK` prefix and the `BAD-*` prefixes should be removed. + +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-apple-darwin | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=arm64ec-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: BPF has a compiler error +; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 -mcpu=ck860fv -mattr=+hard-float | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; FIXME: directx has a compiler error +; RUN: %if hexagon-registered-target %{ llc %s -o - -mtriple=hexagon-unknown-linux-musl | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if lanai-registered-target %{ llc %s -o - -mtriple=lanai-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if
loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if m68k-registered-target %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64el-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mipsel-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if msp430-registered-target %{ llc %s -o - -mtriple=msp430-none-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if nvptx-registered-target %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda | FileCheck %s --check-prefixes=NOCRASH %} +; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if spirv-registered-target %{ llc %s -o - -mtriple=spirv-unknown-unknown | FileCheck %s --check-prefixes=NOCRASH %} +; RUN: %if systemz-registered-target %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if ve-registered-target %{ llc %s -o - -mtriple=ve-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if webassembly-registered-target %{ llc %s -o - -mtriple=wasm32-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | 
FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if xcore-registered-target %{ llc %s -o - -mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if xtensa-registered-target %{ llc %s -o - -mtriple=xtensa-none-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,CHECK-FMA %} + +; Note that arm64ec labels are quoted, hence the `{{"?}}:`. + +; Codegen tests don't work the same for graphics targets. Add a dummy directive +; for FileCheck; just make sure we don't crash. +; NOCRASH: {{.*}} + +; fneg, fabs and copysign all need to not quieten signalling NaNs, so they should not call any conversion functions which do. +; These tests won't catch cases where everything is done using native instructions instead of builtins. +; See https://github.com/llvm/llvm-project/issues/104915 + +define void @test_fneg(ptr %p1, ptr %p2) #0 { +; ALL-LABEL: test_fneg{{"?}}: +; CHECK-NEG-ABS-NOT: __extend +; CHECK-NEG-ABS-NOT: __trunc +; CHECK-NEG-ABS-NOT: __gnu +; CHECK-NEG-ABS-NOT: __aeabi +; BAD-NEG-ABS: {{__extendhfsf2|__gnu_h2f_ieee|__aeabi_h2f}} + %v = load half, ptr %p1 + %res = fneg half %v + store half %res, ptr %p2 + ret void +} + +define void @test_fabs(ptr %p1, ptr %p2) { +; ALL-LABEL: test_fabs{{"?}}: +; CHECK-NEG-ABS-NOT: __extend +; CHECK-NEG-ABS-NOT: __trunc +; CHECK-NEG-ABS-NOT: __gnu +; CHECK-NEG-ABS-NOT: __aeabi +; BAD-NEG-ABS: {{__extendhfsf2|__gnu_h2f_ieee|__aeabi_h2f}} + %a = load half, ptr %p1 + %r = call half @llvm.fabs.f16(half %a) + store half %r, ptr %p2 + ret void +} + +define void @test_copysign(ptr %p1, ptr %p2, ptr %p3) { +; ALL-LABEL: test_copysign{{"?}}: +; CHECK-COPYSIGN-NOT: __extend +; CHECK-COPYSIGN-NOT: __trunc +; CHECK-COPYSIGN-NOT: __gnu +; CHECK-COPYSIGN-NOT: __aeabi +; BAD-COPYSIGN: {{__extendhfsf2|__gnu_h2f_ieee}} + %a = load half, ptr %p1 + %b = load half, ptr %p2 + %r = call half @llvm.copysign.f16(half %a, half %b) + store half %r, ptr %p3 + ret void +} + +; If promoting, fma must promote at least to f64 to avoid double rounding issues. +; This checks for calls to f32 fmaf and truncating f32 to f16.
+; See https://github.com/llvm/llvm-project/issues/98389 + +define void @test_fma(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { +; ALL-LABEL: test_fma{{"?}}: +; Allow fmaf16 +; CHECK-FMA-NOT: fmaf{{\b}} +; CHECK-FMA-NOT: __truncsfhf2 +; CHECK-FMA-NOT: __gnu_f2h_ieee +; CHECK-FMA-NOT: __aeabi_f2h +; BAD-FMA: {{__truncsfhf2|__gnu_f2h_ieee|__aeabi_f2h|fmaf\b}} + %a = load half, ptr %p1 + %b = load half, ptr %p2 + %c = load half, ptr %p3 + %r = call half @llvm.fma.f16(half %a, half %b, half %c) + store half %r, ptr %p4 + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/memcmp.ll b/llvm/test/CodeGen/PowerPC/memcmp.ll index 39f9269..4998d87 100644 --- a/llvm/test/CodeGen/PowerPC/memcmp.ll +++ b/llvm/test/CodeGen/PowerPC/memcmp.ll @@ -6,12 +6,10 @@ define signext i32 @memcmp8(ptr nocapture readonly %buffer1, ptr nocapture reado ; CHECK: # %bb.0: ; CHECK-NEXT: ldbrx 3, 0, 3 ; CHECK-NEXT: ldbrx 4, 0, 4 -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: subc 3, 4, 3 -; CHECK-NEXT: subfe 3, 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: neg 3, 3 -; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: extsw 3, 3 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 8) @@ -23,11 +21,11 @@ define signext i32 @memcmp4(ptr nocapture readonly %buffer1, ptr nocapture reado ; CHECK: # %bb.0: ; CHECK-NEXT: lwbrx 3, 0, 3 ; CHECK-NEXT: lwbrx 4, 0, 4 -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 +; CHECK-NEXT: extsw 3, 3 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4) ret i32 %call diff --git a/llvm/test/CodeGen/PowerPC/ucmp.ll b/llvm/test/CodeGen/PowerPC/ucmp.ll index d2dff6e..4d393dd 100644 --- a/llvm/test/CodeGen/PowerPC/ucmp.ll +++ b/llvm/test/CodeGen/PowerPC/ucmp.ll @@ -4,12 +4,10 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind { ; CHECK-LABEL: ucmp_8_8: ; CHECK: # %bb.0: -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: rldic 3, 3, 0, 32 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i8 @llvm.ucmp(i8 %x, i8 %y) ret i8 %1 @@ -18,12 +16,10 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind { define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind { ; CHECK-LABEL: ucmp_8_16: ; CHECK: # %bb.0: -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: rldic 3, 3, 0, 32 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i8 @llvm.ucmp(i16 %x, i16 %y) ret i8 %1 @@ -32,14 +28,10 @@ define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind { define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: ucmp_8_32: ; CHECK: # %bb.0: -; CHECK-NEXT: clrldi 5, 4, 32 -; CHECK-NEXT: clrldi 6, 3, 32 -; CHECK-NEXT: sub 5, 5, 6 -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldic 3, 3, 0, 32 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 
4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i8 @llvm.ucmp(i32 %x, i32 %y) ret i8 %1 @@ -48,12 +40,10 @@ define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind { define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind { ; CHECK-LABEL: ucmp_8_64: ; CHECK: # %bb.0: -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: subc 3, 4, 3 -; CHECK-NEXT: subfe 3, 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: neg 3, 3 -; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i8 @llvm.ucmp(i64 %x, i64 %y) ret i8 %1 @@ -82,14 +72,10 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind { define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: ucmp_32_32: ; CHECK: # %bb.0: -; CHECK-NEXT: clrldi 5, 4, 32 -; CHECK-NEXT: clrldi 6, 3, 32 -; CHECK-NEXT: sub 5, 5, 6 -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldic 3, 3, 0, 32 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i32 @llvm.ucmp(i32 %x, i32 %y) ret i32 %1 @@ -98,12 +84,10 @@ define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind { define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind { ; CHECK-LABEL: ucmp_32_64: ; CHECK: # %bb.0: -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: subc 3, 4, 3 -; CHECK-NEXT: subfe 3, 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: neg 3, 3 -; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i32 @llvm.ucmp(i64 %x, i64 %y) ret i32 %1 @@ -112,12 +96,10 @@ define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind { define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind { ; CHECK-LABEL: ucmp_64_64: ; CHECK: # %bb.0: -; CHECK-NEXT: subc 5, 4, 3 -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: subfe 5, 4, 4 -; CHECK-NEXT: neg 5, 5 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i64 @llvm.ucmp(i64 %x, i64 %y) ret i64 %1 diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index 2ebb6e9..d089e36 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -514,6 +514,7 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32I-NEXT: lui a1, 16 ; RV32I-NEXT: addi s1, a1, -1 ; RV32I-NEXT: and a0, a0, s1 @@ -521,13 +522,12 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s2, a0, a1 ; RV32I-NEXT: and a0, a0, s1 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lui a0, 524288 -; RV32I-NEXT: xor a0, s0, a0 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s1 +; RV32I-NEXT: and a0, s2, s1 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: mv a0, s0 @@ -536,6 +536,7 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 0(sp) # 4-byte 
Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; @@ -545,6 +546,7 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: lui a1, 16 ; RV64I-NEXT: addi s1, a1, -1 ; RV64I-NEXT: and a0, a0, s1 @@ -552,13 +554,12 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s2, a0, a1 ; RV64I-NEXT: and a0, a0, s1 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lui a0, 524288 -; RV64I-NEXT: xor a0, s0, a0 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s1 +; RV64I-NEXT: and a0, s2, s1 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: mv a0, s0 @@ -567,6 +568,7 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; @@ -638,11 +640,7 @@ define half @fsgnjn_h(half %a, half %b) nounwind { ; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: lui a1, 1048568 ; RV32I-NEXT: slli s1, s1, 17 ; RV32I-NEXT: and a0, a0, a1 @@ -677,11 +675,7 @@ define half @fsgnjn_h(half %a, half %b) nounwind { ; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui a1, 524288 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: not a0, a0 ; RV64I-NEXT: lui a1, 1048568 ; RV64I-NEXT: slli s1, s1, 49 ; RV64I-NEXT: and a0, a0, a1 @@ -804,15 +798,14 @@ define half @fabs_h(half %a, half %b) nounwind { ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: slli s0, a0, 17 +; RV32I-NEXT: srli s0, s0, 17 ; RV32I-NEXT: and a0, a0, s2 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: slli a0, a0, 1 -; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -841,15 +834,14 @@ define half @fabs_h(half %a, half %b) nounwind { ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: slli s0, a0, 49 +; RV64I-NEXT: srli s0, s0, 49 ; RV64I-NEXT: and a0, a0, s2 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: slli a0, a0, 33 -; RV64I-NEXT: srli a0, a0, 33 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s2 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -1217,25 +1209,21 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind { ; 
RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s3, a0, -1 -; RV32I-NEXT: and a0, a2, s3 +; RV32I-NEXT: addi s2, a0, -1 +; RV32I-NEXT: and a0, a2, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s3, a0, a1 +; RV32I-NEXT: and a0, s1, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 +; RV32I-NEXT: and a0, s0, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: and a0, s2, s3 +; RV32I-NEXT: and a0, s3, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a2, a0 ; RV32I-NEXT: mv a0, s1 @@ -1261,25 +1249,21 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: lui a0, 16 -; RV64I-NEXT: addi s3, a0, -1 -; RV64I-NEXT: and a0, a2, s3 +; RV64I-NEXT: addi s2, a0, -1 +; RV64I-NEXT: and a0, a2, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui a1, 524288 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s3, a0, a1 +; RV64I-NEXT: and a0, s1, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s0, s3 +; RV64I-NEXT: and a0, s0, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: and a0, s2, s3 +; RV64I-NEXT: and a0, s3, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a2, a0 ; RV64I-NEXT: mv a0, s1 @@ -1355,43 +1339,34 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s1, a2 -; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: lui s3, 16 -; RV32I-NEXT: addi s3, s3, -1 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: lui a1, 16 +; RV32I-NEXT: addi s3, a1, -1 ; RV32I-NEXT: and a0, a0, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 ; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: and a0, s0, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s2, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui s4, 524288 -; RV32I-NEXT: xor a0, a0, s4 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s2, s2, a1 +; RV32I-NEXT: xor s4, a0, a1 ; RV32I-NEXT: and a0, s1, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: xor a0, a0, s4 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 -; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: and a0, s2, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: and a0, s4, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a2, a0 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: mv a1, s0 ; 
RV32I-NEXT: call fmaf ; RV32I-NEXT: call __truncsfhf2 @@ -1413,43 +1388,34 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s1, a2 -; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: lui s3, 16 -; RV64I-NEXT: addi s3, s3, -1 +; RV64I-NEXT: mv s0, a2 +; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: lui a1, 16 +; RV64I-NEXT: addi s3, a1, -1 ; RV64I-NEXT: and a0, a0, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 ; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: and a0, s0, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s2, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui s4, 524288 -; RV64I-NEXT: xor a0, a0, s4 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s2, s2, a1 +; RV64I-NEXT: xor s4, a0, a1 ; RV64I-NEXT: and a0, s1, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: xor a0, a0, s4 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s0, s3 -; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: and a0, s2, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: and a0, s4, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a2, a0 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: mv a1, s0 ; RV64I-NEXT: call fmaf ; RV64I-NEXT: call __truncsfhf2 @@ -1535,44 +1501,35 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s1, a2 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lui s3, 16 -; RV32I-NEXT: addi s3, s3, -1 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lui a0, 16 +; RV32I-NEXT: addi s3, a0, -1 ; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 ; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: and a0, s0, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s2, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui s4, 524288 -; RV32I-NEXT: xor a0, a0, s4 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s2, s2, a1 +; RV32I-NEXT: xor s4, a0, a1 ; RV32I-NEXT: and a0, s1, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: xor a0, a0, s4 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 -; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: and a0, s2, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: and a0, s4, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a2, a0 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 +; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: call fmaf ; RV32I-NEXT: call __truncsfhf2 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -1593,44 +1550,35 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind { 
; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s1, a2 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lui s3, 16 -; RV64I-NEXT: addi s3, s3, -1 +; RV64I-NEXT: mv s0, a2 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a0, 16 +; RV64I-NEXT: addi s3, a0, -1 ; RV64I-NEXT: and a0, a1, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 ; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: and a0, s0, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s2, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui s4, 524288 -; RV64I-NEXT: xor a0, a0, s4 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s2, s2, a1 +; RV64I-NEXT: xor s4, a0, a1 ; RV64I-NEXT: and a0, s1, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: xor a0, a0, s4 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s0, s3 -; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: and a0, s2, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: and a0, s4, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a2, a0 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s2 +; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: call fmaf ; RV64I-NEXT: call __truncsfhf2 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload @@ -1960,25 +1908,21 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: lui a1, 16 -; RV32I-NEXT: addi s3, a1, -1 -; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: addi s2, a1, -1 +; RV32I-NEXT: and a0, a0, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s3, a0, a1 +; RV32I-NEXT: and a0, s1, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 +; RV32I-NEXT: and a0, s0, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: and a0, s2, s3 +; RV32I-NEXT: and a0, s3, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: mv a2, s0 @@ -2003,25 +1947,21 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: mv s0, a2 ; RV64I-NEXT: mv s1, a1 ; RV64I-NEXT: lui a1, 16 -; RV64I-NEXT: addi s3, a1, -1 -; RV64I-NEXT: and a0, a0, s3 +; RV64I-NEXT: addi s2, a1, -1 +; RV64I-NEXT: and a0, a0, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui a1, 524288 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s3, a0, a1 +; RV64I-NEXT: and a0, s1, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s0, s3 +; RV64I-NEXT: and a0, s0, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: and a0, s2, s3 +; 
RV64I-NEXT: and a0, s3, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: mv a2, s0 @@ -2096,25 +2036,21 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s3, a0, -1 -; RV32I-NEXT: and a0, a1, s3 +; RV32I-NEXT: addi s2, a0, -1 +; RV32I-NEXT: and a0, a1, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s3, a0, a1 +; RV32I-NEXT: and a0, s1, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 +; RV32I-NEXT: and a0, s0, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: and a0, s2, s3 +; RV32I-NEXT: and a0, s3, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: mv a0, s1 @@ -2140,25 +2076,21 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: mv s0, a2 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: lui a0, 16 -; RV64I-NEXT: addi s3, a0, -1 -; RV64I-NEXT: and a0, a1, s3 +; RV64I-NEXT: addi s2, a0, -1 +; RV64I-NEXT: and a0, a1, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui a1, 524288 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s3, a0, a1 +; RV64I-NEXT: and a0, s1, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s0, s3 +; RV64I-NEXT: and a0, s0, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: and a0, s2, s3 +; RV64I-NEXT: and a0, s3, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: mv a0, s1 @@ -2519,12 +2451,8 @@ define half @fnmadd_h_contract(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __mulsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s1, a0, a1 ; RV32I-NEXT: and a0, s0, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 @@ -2580,12 +2508,8 @@ define half @fnmadd_h_contract(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __mulsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui a1, 524288 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s1, a0, a1 ; RV64I-NEXT: and a0, s0, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vabd.ll b/llvm/test/CodeGen/Thumb2/mve-vabd.ll index 8d52fe5..3c35a29 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vabd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabd.ll @@ -63,34 +63,30 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-MVE-NEXT: 
mov r4, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[0] +; CHECK-MVE-NEXT: vmov.u16 r0, q1[1] ; CHECK-MVE-NEXT: vmov q5, q1 ; CHECK-MVE-NEXT: vmov q4, q0 ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r5, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q4[0] +; CHECK-MVE-NEXT: vmov.u16 r0, q4[1] ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h ; CHECK-MVE-NEXT: mov r5, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q5[1] +; CHECK-MVE-NEXT: vmov.u16 r0, q5[0] ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r6, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q4[1] +; CHECK-MVE-NEXT: vmov.u16 r0, q4[0] ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r1, r6 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: vmov.16 q6[0], r5 -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: vmov.16 q6[1], r0 +; CHECK-MVE-NEXT: bfc r0, #15, #17 +; CHECK-MVE-NEXT: bfc r5, #15, #17 +; CHECK-MVE-NEXT: vmov.16 q6[0], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[2] +; CHECK-MVE-NEXT: vmov.16 q6[1], r5 ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r5, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q4[2] @@ -98,9 +94,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h +; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[2], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[3] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -110,9 +104,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h +; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[3], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[4] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -122,9 +114,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h +; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[4], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[5] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -134,9 +124,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h +; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[5], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[6] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -146,9 +134,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h +; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[6], r0 ; 
CHECK-MVE-NEXT: vmov.u16 r0, q5[7] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -158,9 +144,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h +; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[7], r0 ; CHECK-MVE-NEXT: vstrw.32 q6, [r4] ; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11, d12, d13} diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll new file mode 100644 index 0000000..2aea9c1 --- /dev/null +++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll @@ -0,0 +1,38 @@ +;; Test that a potential indirect call target function which has internal linkage and +;; its address taken has its type ID emitted to the callgraph section. +;; This test also makes sure that callback functions which meet the above constraint +;; are handled correctly. + +; RUN: llc -mtriple=x86_64-unknown-linux --call-graph-section -o - < %s | FileCheck %s + +declare !type !0 void @_Z6doWorkPFviE(ptr) + +define i32 @_Z4testv() !type !1 { +entry: + call void @_Z6doWorkPFviE(ptr nonnull @_ZL10myCallbacki) + ret i32 0 +} + +; CHECK: _ZL10myCallbacki: +; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: +define internal void @_ZL10myCallbacki(i32 %value) !type !2 { +entry: + %sink = alloca i32, align 4 + store volatile i32 %value, ptr %sink, align 4 + %i1 = load volatile i32, ptr %sink, align 4 + ret void +} + +!0 = !{i64 0, !"_ZTSFvPFviEE.generalized"} +!1 = !{i64 0, !"_ZTSFivE.generalized"} +!2 = !{i64 0, !"_ZTSFviE.generalized"} + +; CHECK: .section .callgraph,"o",@progbits,.text +;; Version +; CHECK-NEXT: .byte 0 +;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. +; CHECK-NEXT: .byte 1 +;; Function Entry PC +; CHECK-NEXT: .quad [[LABEL_FUNC]] +;; Function type ID +; CHECK-NEXT: .quad -5212364466660467813 diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll index f0dbc31..1aabf66 --- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll @@ -15,16 +15,13 @@ declare !type !2 ptr @direct_baz(ptr) define ptr @ball() { entry: call void @direct_foo() - %fp_foo_val = load ptr, ptr null, align 8 - ; CHECK: [[LABEL_TMP0:\.L.*]]: + %fp_foo_val = load ptr, ptr null, align 8 call void (...) %fp_foo_val(), !callee_type !0 call void @direct_foo() - %fp_bar_val = load ptr, ptr null, align 8 - ; CHECK: [[LABEL_TMP1:\.L.*]]: + %fp_bar_val = load ptr, ptr null, align 8 %call_fp_bar = call i32 %fp_bar_val(i8 0), !callee_type !2 %call_fp_bar_direct = call i32 @direct_bar(i8 1) %fp_baz_val = load ptr, ptr null, align 8 - ; CHECK: [[LABEL_TMP2:\.L.*]]: %call_fp_baz = call ptr %fp_baz_val(ptr null), !callee_type !4 call void @direct_foo() %call_fp_baz_direct = call ptr @direct_baz(ptr null) @@ -32,29 +29,31 @@ entry: ret ptr %call_fp_baz } -; CHECK: .section .callgraph,"o",@progbits,.text - -; CHECK-NEXT: .quad 0 -; CHECK-NEXT: .quad [[LABEL_FUNC]] -; CHECK-NEXT: .quad 1 -; CHECK-NEXT: .quad 3 !0 = !{!1} !1 = !{i64 0, !"_ZTSFvE.generalized"} -;; Test for MD5 hash of _ZTSFvE.generalized and the generated temporary callsite label. 
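The addrtaken test above pins down the record shape its CHECK lines assert: a version byte (0), a flags byte whose LSB marks a potential indirect call target, the function entry PC, and the 64-bit function type ID. A hedged C sketch of a reader for this fixed-size prefix, assuming a little-endian object file (the struct and function names are hypothetical):

```c
#include <stdint.h>
#include <string.h>

struct RecordPrefix {
  uint8_t Version;  /* 0 in the current format               */
  uint8_t Flags;    /* bit 0: potential indirect call target */
  uint64_t EntryPC; /* function entry point address          */
  uint64_t TypeID;  /* 0 when no !type metadata is attached  */
};

/* Reads the fixed-size prefix of one record and returns the cursor
   positioned at the variable-length callee/type-ID arrays that the
   flags byte says are present. */
static const uint8_t *readPrefix(const uint8_t *P, struct RecordPrefix *R) {
  R->Version = *P++;
  R->Flags = *P++;
  memcpy(&R->EntryPC, P, sizeof(R->EntryPC)); P += sizeof(R->EntryPC);
  memcpy(&R->TypeID, P, sizeof(R->TypeID));   P += sizeof(R->TypeID);
  return P;
}
```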
-; CHECK-NEXT: .quad 4524972987496481828 -; CHECK-NEXT: .quad [[LABEL_TMP0]] !2 = !{!3} !3 = !{i64 0, !"_ZTSFicE.generalized"} -;; Test for MD5 hash of _ZTSFicE.generalized and the generated temporary callsite label. -; CHECK-NEXT: .quad 3498816979441845844 -; CHECK-NEXT: .quad [[LABEL_TMP1]] !4 = !{!5} !5 = !{i64 0, !"_ZTSFPvS_E.generalized"} -;; Test for MD5 hash of _ZTSFPvS_E.generalized and the generated temporary callsite label. -; CHECK-NEXT: .quad 8646233951371320954 -; CHECK-NEXT: .quad [[LABEL_TMP2]] -;; Test for number of direct calls and {callsite_label, callee} pairs. -; CHECK-NEXT: .quad 3 + +; CHECK: .section .callgraph,"o",@progbits,.text +;; Version +; CHECK-NEXT: .byte 0 +;; Flags +; CHECK-NEXT: .byte 7 +;; Function Entry PC +; CHECK-NEXT: .quad [[LABEL_FUNC]] +;; Function type ID -- set to 0 as no type metadata is attached to the function. +; CHECK-NEXT: .quad 0 +;; Number of unique direct callees. +; CHECK-NEXT: .byte 3 +;; Direct callees. ; CHECK-NEXT: .quad direct_foo ; CHECK-NEXT: .quad direct_bar ; CHECK-NEXT: .quad direct_baz +;; Number of unique indirect target type IDs. +; CHECK-NEXT: .byte 3 +;; Indirect type IDs. +; CHECK-NEXT: .quad 4524972987496481828 +; CHECK-NEXT: .quad 3498816979441845844 +; CHECK-NEXT: .quad 8646233951371320954 diff --git a/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll b/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll index fa14a98..34dc5b8 --- a/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll @@ -22,13 +22,14 @@ declare !type !2 i32 @foo(i8 signext) declare !type !2 i32 @bar(i8 signext) -;; Check that the numeric type id (md5 hash) for the below type ids are emitted -;; to the callgraph section. - -; CHECK: Hex dump of section '.callgraph': - !0 = !{i64 0, !"_ZTSFiPvcE.generalized"} !1 = !{!2} -; CHECK-DAG: 5486bc59 814b8e30 !2 = !{i64 0, !"_ZTSFicE.generalized"} !3 = !{i64 0, !"_ZTSFiiE.generalized"} + +; CHECK: Hex dump of section '.callgraph': +; CHECK-NEXT: 0x00000000 00050000 00000000 00008e19 0b7f3326 +; CHECK-NEXT: 0x00000010 e3000154 86bc5981 4b8e3000 05000000 +;; Verify that the type id 0x308e4b8159bc8654 is in the section. +; CHECK-NEXT: 0x00000020 00000000 00a150b8 3e0cfe3c b2015486 +; CHECK-NEXT: 0x00000030 bc59814b 8e30 diff --git a/llvm/test/CodeGen/X86/call-graph-section.ll b/llvm/test/CodeGen/X86/call-graph-section.ll index 66d009c..c144a24 --- a/llvm/test/CodeGen/X86/call-graph-section.ll +++ b/llvm/test/CodeGen/X86/call-graph-section.ll @@ -22,15 +22,16 @@ entry: ;; Check that the numeric type ids (md5 hashes) for the below types are emitted ;; to the callgraph section. 
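The tailcall and call-graph-section tests now match the raw hex dump of the section instead of CHECK-DAG fragments, so byte order matters: the type ID 0x308e4b8159bc8654 shows up in the dump as `5486bc59 814b8e30` because the quads are written in the target's little-endian order. A quick illustrative check in C:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint64_t TypeID = 0x308e4b8159bc8654ULL;
  for (int i = 0; i < 8; ++i) /* least-significant byte first */
    printf("%02x", (unsigned)((TypeID >> (8 * i)) & 0xff));
  printf("\n"); /* prints 5486bc59814b8e30, matching the dump */
  return 0;
}
```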
- -; CHECK: Hex dump of section '.callgraph': - -; CHECK-DAG: 2444f731 f5eecb3e !0 = !{i64 0, !"_ZTSFvE.generalized"} !1 = !{!0} -; CHECK-DAG: 5486bc59 814b8e30 !2 = !{i64 0, !"_ZTSFicE.generalized"} !3 = !{!2} -; CHECK-DAG: 7ade6814 f897fd77 !4 = !{!5} !5 = !{i64 0, !"_ZTSFPvS_E.generalized"} + +;; Make sure the following type IDs are in the call graph section +;; 0x5eecb3e2444f731f, 0x814b8e305486bc59, 0xf897fd777ade6814 ; CHECK: Hex dump of section '.callgraph': ; CHECK-NEXT: 0x00000000 00050000 00000000 00000000 00000000 ; CHECK-NEXT: 0x00000010 00000324 44f731f5 eecb3e54 86bc5981 ; CHECK-NEXT: 0x00000020 4b8e307a de6814f8 97fd77 diff --git a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll new file mode 100644 index 0000000..a0c243b --- /dev/null +++ b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll @@ -0,0 +1,43 @@ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;; A minimal test case. llc will crash if a global variable already has a section +;; prefix. Subsequent PRs will expand on this test case to test the hotness +;; reconciliation implementation. + +; RUN: not llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ +; RUN: -partition-static-data-sections=true \ +; RUN: -data-sections=true -unique-section-names=false \ +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=ERR + +; ERR: Global variable hot_bss already has a section prefix hot + +@hot_bss = internal global i32 0, !section_prefix !17 + +define void @hot_func() !prof !14 { + %9 = load i32, ptr @hot_bss + %11 = call i32 (...) @func_taking_arbitrary_param(i32 %9) + ret void +} + +declare i32 @func_taking_arbitrary_param(...) 
+ +!llvm.module.flags = !{!1} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 1460183} +!5 = !{!"MaxCount", i64 849024} +!6 = !{!"MaxInternalCount", i64 32769} +!7 = !{!"MaxFunctionCount", i64 849024} +!8 = !{!"NumCounts", i64 23627} +!9 = !{!"NumFunctions", i64 3271} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13} +!12 = !{i32 990000, i64 166, i32 73} +!13 = !{i32 999999, i64 3, i32 1443} +!14 = !{!"function_entry_count", i64 100000} +!15 = !{!"function_entry_count", i64 1} +!16 = !{!"branch_weights", i32 1, i32 99999} +!17 = !{!"section_prefix", !"hot"} diff --git a/llvm/test/CodeGen/X86/pr162812.ll b/llvm/test/CodeGen/X86/pr162812.ll new file mode 100644 index 0000000..4ea3101 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr162812.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE42 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512 + +define <32 x i8> @PR162812(<32 x i8> %a, <32 x i8> %mask) { +; SSE2-LABEL: PR162812: +; SSE2: # %bb.0: +; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8224,8224,8224,8224,8224,8224,8224,8224] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: paddb %xmm3, %xmm3 +; SSE2-NEXT: paddb %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE42-LABEL: PR162812: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm2, %xmm5 +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: movdqa %xmm0, %xmm6 +; SSE42-NEXT: psllw $2, %xmm6 +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; SSE42-NEXT: pand %xmm7, %xmm6 +; SSE42-NEXT: psrlw $2, %xmm5 +; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [8224,8224,8224,8224,8224,8224,8224,8224] +; SSE42-NEXT: pand %xmm4, %xmm5 +; SSE42-NEXT: paddb %xmm5, %xmm5 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm2 +; SSE42-NEXT: movdqa %xmm2, %xmm6 +; SSE42-NEXT: paddb %xmm2, %xmm6 +; SSE42-NEXT: paddb %xmm5, %xmm5 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm2 +; SSE42-NEXT: movdqa %xmm1, %xmm5 +; SSE42-NEXT: psllw $2, %xmm5 +; SSE42-NEXT: pand %xmm7, %xmm5 +; SSE42-NEXT: psrlw $2, %xmm3 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: paddb %xmm4, %xmm4 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: pblendvb %xmm0, %xmm5, %xmm1 +; SSE42-NEXT: movdqa %xmm1, %xmm3 +; SSE42-NEXT: paddb %xmm1, %xmm3 +; SSE42-NEXT: paddb %xmm4, %xmm4 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: retq +; +; AVX2-LABEL: PR162812: +; AVX2: # 
%bb.0: +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: PR162812: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512-NEXT: vpsrlw $2, %ymm1, %ymm1 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 +; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = lshr <32 x i8> %mask, splat (i8 7) + %ret = shl <32 x i8> %a, %1 + ret <32 x i8> %ret +} diff --git a/llvm/test/DebugInfo/Generic/compileunit-source-language-name.ll b/llvm/test/DebugInfo/Generic/compileunit-source-language-name.ll new file mode 100644 index 0000000..211a7bc --- /dev/null +++ b/llvm/test/DebugInfo/Generic/compileunit-source-language-name.ll @@ -0,0 +1,17 @@ +; RUN: %llc_dwarf -filetype=obj -O0 < %s | llvm-dwarfdump -debug-info - | FileCheck %s --implicit-check-not "DW_AT_language" + +; CHECK: DW_AT_language_name (DW_LNAME_ObjC_plus_plus) + +@x = global i32 0, align 4, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!6, !7} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_ObjC_plus_plus, file: !3, producer: "handwritten", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") +!3 = !DIFile(filename: "cu.cpp", directory: "/tmp") +!4 = !{!0} +!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!6 = !{i32 7, !"Dwarf Version", i32 5} +!7 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/llvm/test/DebugInfo/Generic/compileunit-source-language.ll b/llvm/test/DebugInfo/Generic/compileunit-source-language.ll new file mode 100644 index 0000000..bafe620 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/compileunit-source-language.ll @@ -0,0 +1,17 @@ +; RUN: %llc_dwarf -filetype=obj -O0 < %s | llvm-dwarfdump -debug-info - | FileCheck %s --implicit-check-not "DW_AT_language_name" + +; CHECK: DW_AT_language (DW_LANG_C) + +@x = global i32 0, align 4, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!6, !7} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "handwritten", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") +!3 = !DIFile(filename: "cu.cpp", directory: "/tmp") +!4 = !{!0} +!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!6 = !{i32 7, !"Dwarf Version", i32 5} +!7 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll 
b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll index 5929c15..84c7df1 100644 --- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll +++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll @@ -1,152 +1,190 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S %s -passes=atomic-expand -mtriple=x86_64-linux-gnu | FileCheck %s ; This file tests the functions `llvm::convertAtomicLoadToIntegerType` and -; `llvm::convertAtomicStoreToIntegerType`. If X86 stops using this +; `llvm::convertAtomicStoreToIntegerType`. If X86 stops using this ; functionality, please move this test to a target which still is. define float @float_load_expand(ptr %ptr) { -; CHECK-LABEL: @float_load_expand -; CHECK: %1 = load atomic i32, ptr %ptr unordered, align 4 -; CHECK: %2 = bitcast i32 %1 to float -; CHECK: ret float %2 +; CHECK-LABEL: define float @float_load_expand( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[PTR]] unordered, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float +; CHECK-NEXT: ret float [[TMP2]] +; %res = load atomic float, ptr %ptr unordered, align 4 ret float %res } define float @float_load_expand_seq_cst(ptr %ptr) { -; CHECK-LABEL: @float_load_expand_seq_cst -; CHECK: %1 = load atomic i32, ptr %ptr seq_cst, align 4 -; CHECK: %2 = bitcast i32 %1 to float -; CHECK: ret float %2 +; CHECK-LABEL: define float @float_load_expand_seq_cst( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[PTR]] seq_cst, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float +; CHECK-NEXT: ret float [[TMP2]] +; %res = load atomic float, ptr %ptr seq_cst, align 4 ret float %res } define float @float_load_expand_vol(ptr %ptr) { -; CHECK-LABEL: @float_load_expand_vol -; CHECK: %1 = load atomic volatile i32, ptr %ptr unordered, align 4 -; CHECK: %2 = bitcast i32 %1 to float -; CHECK: ret float %2 +; CHECK-LABEL: define float @float_load_expand_vol( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load atomic volatile i32, ptr [[PTR]] unordered, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float +; CHECK-NEXT: ret float [[TMP2]] +; %res = load atomic volatile float, ptr %ptr unordered, align 4 ret float %res } define float @float_load_expand_addr1(ptr addrspace(1) %ptr) { -; CHECK-LABEL: @float_load_expand_addr1 -; CHECK: %1 = load atomic i32, ptr addrspace(1) %ptr unordered, align 4 -; CHECK: %2 = bitcast i32 %1 to float -; CHECK: ret float %2 +; CHECK-LABEL: define float @float_load_expand_addr1( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load atomic i32, ptr addrspace(1) [[PTR]] unordered, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float +; CHECK-NEXT: ret float [[TMP2]] +; %res = load atomic float, ptr addrspace(1) %ptr unordered, align 4 ret float %res } define void @float_store_expand(ptr %ptr, float %v) { -; CHECK-LABEL: @float_store_expand -; CHECK: %1 = bitcast float %v to i32 -; CHECK: store atomic i32 %1, ptr %ptr unordered, align 4 +; CHECK-LABEL: define void @float_store_expand( +; CHECK-SAME: ptr [[PTR:%.*]], float [[V:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[V]] to i32 +; CHECK-NEXT: store atomic i32 [[TMP1]], ptr [[PTR]] unordered, align 4 +; CHECK-NEXT: ret void +; store atomic float %v, ptr %ptr unordered, align 4 ret void } define void @float_store_expand_seq_cst(ptr %ptr, float %v) { -; 
CHECK-LABEL: @float_store_expand_seq_cst -; CHECK: %1 = bitcast float %v to i32 -; CHECK: store atomic i32 %1, ptr %ptr seq_cst, align 4 +; CHECK-LABEL: define void @float_store_expand_seq_cst( +; CHECK-SAME: ptr [[PTR:%.*]], float [[V:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[V]] to i32 +; CHECK-NEXT: store atomic i32 [[TMP1]], ptr [[PTR]] seq_cst, align 4 +; CHECK-NEXT: ret void +; store atomic float %v, ptr %ptr seq_cst, align 4 ret void } define void @float_store_expand_vol(ptr %ptr, float %v) { -; CHECK-LABEL: @float_store_expand_vol -; CHECK: %1 = bitcast float %v to i32 -; CHECK: store atomic volatile i32 %1, ptr %ptr unordered, align 4 +; CHECK-LABEL: define void @float_store_expand_vol( +; CHECK-SAME: ptr [[PTR:%.*]], float [[V:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[V]] to i32 +; CHECK-NEXT: store atomic volatile i32 [[TMP1]], ptr [[PTR]] unordered, align 4 +; CHECK-NEXT: ret void +; store atomic volatile float %v, ptr %ptr unordered, align 4 ret void } define void @float_store_expand_addr1(ptr addrspace(1) %ptr, float %v) { -; CHECK-LABEL: @float_store_expand_addr1 -; CHECK: %1 = bitcast float %v to i32 -; CHECK: store atomic i32 %1, ptr addrspace(1) %ptr unordered, align 4 +; CHECK-LABEL: define void @float_store_expand_addr1( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], float [[V:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[V]] to i32 +; CHECK-NEXT: store atomic i32 [[TMP1]], ptr addrspace(1) [[PTR]] unordered, align 4 +; CHECK-NEXT: ret void +; store atomic float %v, ptr addrspace(1) %ptr unordered, align 4 ret void } define void @pointer_cmpxchg_expand(ptr %ptr, ptr %v) { -; CHECK-LABEL: @pointer_cmpxchg_expand -; CHECK: %1 = ptrtoint ptr %v to i64 -; CHECK: %2 = cmpxchg ptr %ptr, i64 0, i64 %1 seq_cst monotonic -; CHECK: %3 = extractvalue { i64, i1 } %2, 0 -; CHECK: %4 = extractvalue { i64, i1 } %2, 1 -; CHECK: %5 = inttoptr i64 %3 to ptr -; CHECK: %6 = insertvalue { ptr, i1 } poison, ptr %5, 0 -; CHECK: %7 = insertvalue { ptr, i1 } %6, i1 %4, 1 +; CHECK-LABEL: define void @pointer_cmpxchg_expand( +; CHECK-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 0, i64 [[TMP1]] seq_cst monotonic, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK-NEXT: ret void +; cmpxchg ptr %ptr, ptr null, ptr %v seq_cst monotonic ret void } define void @pointer_cmpxchg_expand2(ptr %ptr, ptr %v) { -; CHECK-LABEL: @pointer_cmpxchg_expand2 -; CHECK: %1 = ptrtoint ptr %v to i64 -; CHECK: %2 = cmpxchg ptr %ptr, i64 0, i64 %1 release monotonic -; CHECK: %3 = extractvalue { i64, i1 } %2, 0 -; CHECK: %4 = extractvalue { i64, i1 } %2, 1 -; CHECK: %5 = inttoptr i64 %3 to ptr -; CHECK: %6 = insertvalue { ptr, i1 } poison, ptr %5, 0 -; CHECK: %7 = insertvalue { ptr, i1 } %6, i1 %4, 1 +; CHECK-LABEL: define void @pointer_cmpxchg_expand2( +; CHECK-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 0, i64 [[TMP1]] release monotonic, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } 
[[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK-NEXT: ret void +; cmpxchg ptr %ptr, ptr null, ptr %v release monotonic ret void } define void @pointer_cmpxchg_expand3(ptr %ptr, ptr %v) { -; CHECK-LABEL: @pointer_cmpxchg_expand3 -; CHECK: %1 = ptrtoint ptr %v to i64 -; CHECK: %2 = cmpxchg ptr %ptr, i64 0, i64 %1 seq_cst seq_cst -; CHECK: %3 = extractvalue { i64, i1 } %2, 0 -; CHECK: %4 = extractvalue { i64, i1 } %2, 1 -; CHECK: %5 = inttoptr i64 %3 to ptr -; CHECK: %6 = insertvalue { ptr, i1 } poison, ptr %5, 0 -; CHECK: %7 = insertvalue { ptr, i1 } %6, i1 %4, 1 +; CHECK-LABEL: define void @pointer_cmpxchg_expand3( +; CHECK-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 0, i64 [[TMP1]] seq_cst seq_cst, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK-NEXT: ret void +; cmpxchg ptr %ptr, ptr null, ptr %v seq_cst seq_cst ret void } define void @pointer_cmpxchg_expand4(ptr %ptr, ptr %v) { -; CHECK-LABEL: @pointer_cmpxchg_expand4 -; CHECK: %1 = ptrtoint ptr %v to i64 -; CHECK: %2 = cmpxchg weak ptr %ptr, i64 0, i64 %1 seq_cst seq_cst -; CHECK: %3 = extractvalue { i64, i1 } %2, 0 -; CHECK: %4 = extractvalue { i64, i1 } %2, 1 -; CHECK: %5 = inttoptr i64 %3 to ptr -; CHECK: %6 = insertvalue { ptr, i1 } poison, ptr %5, 0 -; CHECK: %7 = insertvalue { ptr, i1 } %6, i1 %4, 1 +; CHECK-LABEL: define void @pointer_cmpxchg_expand4( +; CHECK-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg weak ptr [[PTR]], i64 0, i64 [[TMP1]] seq_cst seq_cst, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK-NEXT: ret void +; cmpxchg weak ptr %ptr, ptr null, ptr %v seq_cst seq_cst ret void } define void @pointer_cmpxchg_expand5(ptr %ptr, ptr %v) { -; CHECK-LABEL: @pointer_cmpxchg_expand5 -; CHECK: %1 = ptrtoint ptr %v to i64 -; CHECK: %2 = cmpxchg volatile ptr %ptr, i64 0, i64 %1 seq_cst seq_cst -; CHECK: %3 = extractvalue { i64, i1 } %2, 0 -; CHECK: %4 = extractvalue { i64, i1 } %2, 1 -; CHECK: %5 = inttoptr i64 %3 to ptr -; CHECK: %6 = insertvalue { ptr, i1 } poison, ptr %5, 0 -; CHECK: %7 = insertvalue { ptr, i1 } %6, i1 %4, 1 +; CHECK-LABEL: define void @pointer_cmpxchg_expand5( +; CHECK-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg volatile ptr [[PTR]], i64 0, i64 [[TMP1]] seq_cst seq_cst, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 
} poison, ptr [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK-NEXT: ret void +; cmpxchg volatile ptr %ptr, ptr null, ptr %v seq_cst seq_cst ret void } -define void @pointer_cmpxchg_expand6(ptr addrspace(1) %ptr, - ptr addrspace(2) %v) { -; CHECK-LABEL: @pointer_cmpxchg_expand6 -; CHECK: %1 = ptrtoint ptr addrspace(2) %v to i64 -; CHECK: %2 = cmpxchg ptr addrspace(1) %ptr, i64 0, i64 %1 seq_cst seq_cst -; CHECK: %3 = extractvalue { i64, i1 } %2, 0 -; CHECK: %4 = extractvalue { i64, i1 } %2, 1 -; CHECK: %5 = inttoptr i64 %3 to ptr addrspace(2) -; CHECK: %6 = insertvalue { ptr addrspace(2), i1 } poison, ptr addrspace(2) %5, 0 -; CHECK: %7 = insertvalue { ptr addrspace(2), i1 } %6, i1 %4, 1 +define void @pointer_cmpxchg_expand6(ptr addrspace(1) %ptr, ptr addrspace(2) %v) { +; CHECK-LABEL: define void @pointer_cmpxchg_expand6( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], ptr addrspace(2) [[V:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(2) [[V]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 0, i64 [[TMP1]] seq_cst seq_cst, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(2) +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr addrspace(2), i1 } poison, ptr addrspace(2) [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr addrspace(2), i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK-NEXT: ret void +; cmpxchg ptr addrspace(1) %ptr, ptr addrspace(2) null, ptr addrspace(2) %v seq_cst seq_cst ret void } diff --git a/llvm/test/Transforms/GVN/PRE/no-pre-load-for-token-like.ll b/llvm/test/Transforms/GVN/PRE/no-pre-load-for-token-like.ll new file mode 100644 index 0000000..1b36aba --- /dev/null +++ b/llvm/test/Transforms/GVN/PRE/no-pre-load-for-token-like.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=gvn %s | FileCheck %s + +; NOTE: A test to confirm GVN doesn't collapse loads of token-like types into +; NOTE: phi nodes. 
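The regenerated expand-atomic-non-integer.ll checks above all share one shape: the atomic operation happens on an integer of the same width, with `bitcast` (or `ptrtoint`/`inttoptr` for pointers) shims around it. A rough C analogue of the float-load case, as a sketch only (LLVM's `unordered` ordering has no exact C11 counterpart, so `memory_order_relaxed` stands in here):

```c
#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

/* Load the bits atomically as a 32-bit integer, then reinterpret
   them as float -- the `bitcast i32 ... to float` step above. */
float float_load_expanded(const _Atomic uint32_t *p) {
  uint32_t bits = atomic_load_explicit(p, memory_order_relaxed);
  float f;
  memcpy(&f, &bits, sizeof f);
  return f;
}
```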
+ +define ptr @main() { +; CHECK-LABEL: define ptr @main() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[ENTRY_IF_END_I_CRIT_EDGE:.*]], label %[[IF_THEN_I:.*]] +; CHECK: [[ENTRY_IF_END_I_CRIT_EDGE]]: +; CHECK-NEXT: br label %[[IF_END_I:.*]] +; CHECK: [[IF_THEN_I]]: +; CHECK-NEXT: [[TMP0:%.*]] = load target("dx.RawBuffer", half, 1, 0), ptr null, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = tail call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f16_1_0t(target("dx.RawBuffer", half, 1, 0) [[TMP0]], i32 0) +; CHECK-NEXT: br label %[[IF_END_I]] +; CHECK: [[IF_END_I]]: +; CHECK-NEXT: [[TMP2:%.*]] = load target("dx.RawBuffer", half, 1, 0), ptr null, align 4 +; CHECK-NEXT: [[TMP3:%.*]] = tail call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f16_1_0t(target("dx.RawBuffer", half, 1, 0) [[TMP2]], i32 0) +; CHECK-NEXT: ret ptr [[TMP3]] +; +entry: + br i1 false, label %if.end.i, label %if.then.i + +if.then.i: ; preds = %entry + %0 = load target("dx.RawBuffer", half, 1, 0), ptr null, align 4 + %1 = tail call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f16_1_0t(target("dx.RawBuffer", half, 1, 0) %0, i32 0) + br label %if.end.i + +if.end.i: ; preds = %if.then.i, %entry + %2 = load target("dx.RawBuffer", half, 1, 0), ptr null, align 4 + %3 = tail call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f16_1_0t(target("dx.RawBuffer", half, 1, 0) %2, i32 0) + ret ptr %3 +} diff --git a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll index ff6d9aa..1ba7005 100644 --- a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll +++ b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll @@ -481,7 +481,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul(float %x, float %y) { define float @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 -; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 +; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan ninf i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 ; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; @@ -509,7 +509,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul(float %x, float %y) { define float @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 -; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 +; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan ninf i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 ; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; @@ -558,7 +558,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_nsz_inverted(f define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 -; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 +; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan ninf i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 ; CHECK-NEXT: 
[[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; @@ -571,7 +571,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x, float % define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 -; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 +; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan ninf i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 ; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; @@ -585,7 +585,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted(float %x define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero(float %x, float nofpclass(nzero) %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 -; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 +; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan ninf i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 ; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; @@ -598,7 +598,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_ne define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero_negsub(float %x, float nofpclass(nzero nsub) %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero_negsub( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 -; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 +; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan ninf i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 ; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; @@ -705,7 +705,7 @@ define float @fmul_by_self_if_0_oeq_zero_f32(float %x) { define float @fmul_by_self_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x) { ; CHECK-LABEL: @fmul_by_self_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 -; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00 +; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan ninf i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00 ; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; diff --git a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll index 253bc9e7..c14dd46 100644 --- a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll +++ b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll @@ -23,6 +23,50 @@ define float @select_fpclass_fadd(i1 %cond, float nofpclass(nan) %A, float %B) { ret float %D } +define float @select_fpclass_fadd_ninf1(i1 %cond, float nofpclass(nan) %A, float %B) { +; CHECK-LABEL: @select_fpclass_fadd_ninf1( +; CHECK-NEXT: [[C:%.*]] = select ninf i1 [[COND:%.*]], float [[B:%.*]], float 
-0.000000e+00 +; CHECK-NEXT: [[D:%.*]] = fadd float [[A:%.*]], [[C]] +; CHECK-NEXT: ret float [[D]] +; + %C = fadd ninf float %A, %B + %D = select i1 %cond, float %C, float %A + ret float %D +} + +define float @select_fpclass_fadd_ninf2(i1 %cond, float nofpclass(nan) %A, float %B) { +; CHECK-LABEL: @select_fpclass_fadd_ninf2( +; CHECK-NEXT: [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00 +; CHECK-NEXT: [[D:%.*]] = fadd float [[A:%.*]], [[C]] +; CHECK-NEXT: ret float [[D]] +; + %C = fadd float %A, %B + %D = select ninf i1 %cond, float %C, float %A + ret float %D +} + +define float @select_fpclass_fadd_ninf3(i1 %cond, float nofpclass(nan) %A, float %B) { +; CHECK-LABEL: @select_fpclass_fadd_ninf3( +; CHECK-NEXT: [[C:%.*]] = select ninf i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00 +; CHECK-NEXT: [[D:%.*]] = fadd ninf float [[A:%.*]], [[C]] +; CHECK-NEXT: ret float [[D]] +; + %C = fadd ninf float %A, %B + %D = select ninf i1 %cond, float %C, float %A + ret float %D +} + +define float @select_fpclass_fadd_nnan_ninf(i1 %cond, float nofpclass(nan) %A, float %B) { +; CHECK-LABEL: @select_fpclass_fadd_nnan_ninf( +; CHECK-NEXT: [[C:%.*]] = select nnan ninf i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00 +; CHECK-NEXT: [[D:%.*]] = fadd float [[A:%.*]], [[C]] +; CHECK-NEXT: ret float [[D]] +; + %C = fadd float %A, %B + %D = select nnan ninf i1 %cond, float %C, float %A + ret float %D +} + define float @select_nnan_fadd(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fadd( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00 @@ -47,7 +91,7 @@ define float @select_nnan_fadd_swapped(i1 %cond, float %A, float %B) { define float @select_nnan_fadd_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fadd_fast_math( -; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00 +; CHECK-NEXT: [[C:%.*]] = select nnan ninf i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00 ; CHECK-NEXT: [[D:%.*]] = fadd reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; @@ -58,7 +102,7 @@ define float @select_nnan_fadd_fast_math(i1 %cond, float %A, float %B) { define float @select_nnan_fadd_swapped_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fadd_swapped_fast_math( -; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]] +; CHECK-NEXT: [[C:%.*]] = select nnan ninf i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]] ; CHECK-NEXT: [[D:%.*]] = fadd reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; @@ -124,7 +168,7 @@ define float @select_nnan_fmul_swapped(i1 %cond, float %A, float %B) { define float @select_nnan_fmul_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fmul_fast_math( -; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00 +; CHECK-NEXT: [[C:%.*]] = select nnan ninf i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00 ; CHECK-NEXT: [[D:%.*]] = fmul reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; @@ -135,7 +179,7 @@ define float @select_nnan_fmul_fast_math(i1 %cond, float %A, float %B) { define float @select_nnan_fmul_swapped_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fmul_swapped_fast_math( -; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]] +; CHECK-NEXT: [[C:%.*]] = select nnan ninf i1 
[[COND:%.*]], float 1.000000e+00, float [[B:%.*]] ; CHECK-NEXT: [[D:%.*]] = fmul reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; @@ -144,6 +188,50 @@ define float @select_nnan_fmul_swapped_fast_math(i1 %cond, float %A, float %B) { ret float %D } +define float @select_fpclass_fmul_ninf1(i1 %cond, float nofpclass(nan) %A, float %B) { +; CHECK-LABEL: @select_fpclass_fmul_ninf1( +; CHECK-NEXT: [[C:%.*]] = select ninf i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00 +; CHECK-NEXT: [[D:%.*]] = fmul float [[A:%.*]], [[C]] +; CHECK-NEXT: ret float [[D]] +; + %C = fmul ninf float %A, %B + %D = select i1 %cond, float %C, float %A + ret float %D +} + +define float @select_fpclass_fmul_ninf2(i1 %cond, float nofpclass(nan) %A, float %B) { +; CHECK-LABEL: @select_fpclass_fmul_ninf2( +; CHECK-NEXT: [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00 +; CHECK-NEXT: [[D:%.*]] = fmul float [[A:%.*]], [[C]] +; CHECK-NEXT: ret float [[D]] +; + %C = fmul float %A, %B + %D = select ninf i1 %cond, float %C, float %A + ret float %D +} + +define float @select_fpclass_fmul_ninf3(i1 %cond, float nofpclass(nan) %A, float %B) { +; CHECK-LABEL: @select_fpclass_fmul_ninf3( +; CHECK-NEXT: [[C:%.*]] = select ninf i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00 +; CHECK-NEXT: [[D:%.*]] = fmul ninf float [[A:%.*]], [[C]] +; CHECK-NEXT: ret float [[D]] +; + %C = fmul ninf float %A, %B + %D = select ninf i1 %cond, float %C, float %A + ret float %D +} + +define float @select_fpclass_fmul_nnan_ninf(i1 %cond, float nofpclass(nan) %A, float %B) { +; CHECK-LABEL: @select_fpclass_fmul_nnan_ninf( +; CHECK-NEXT: [[C:%.*]] = select nnan ninf i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00 +; CHECK-NEXT: [[D:%.*]] = fmul float [[A:%.*]], [[C]] +; CHECK-NEXT: ret float [[D]] +; + %C = fmul float %A, %B + %D = select nnan ninf i1 %cond, float %C, float %A + ret float %D +} + define float @select_nnan_fsub(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fsub( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00 @@ -168,7 +256,7 @@ define float @select_nnan_fsub_swapped(i1 %cond, float %A, float %B) { define float @select_nnan_fsub_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fsub_fast_math( -; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00 +; CHECK-NEXT: [[C:%.*]] = select nnan ninf i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00 ; CHECK-NEXT: [[D:%.*]] = fsub reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; @@ -179,7 +267,7 @@ define float @select_nnan_fsub_fast_math(i1 %cond, float %A, float %B) { define float @select_nnan_fsub_swapped_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fsub_swapped_fast_math( -; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]] +; CHECK-NEXT: [[C:%.*]] = select nnan ninf i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]] ; CHECK-NEXT: [[D:%.*]] = fsub reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; @@ -188,6 +276,50 @@ define float @select_nnan_fsub_swapped_fast_math(i1 %cond, float %A, float %B) { ret float %D } +define float @select_fpclass_fsub_ninf1(i1 %cond, float nofpclass(nan) %A, float %B) { +; CHECK-LABEL: @select_fpclass_fsub_ninf1( +; CHECK-NEXT: [[C:%.*]] = select ninf i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00 +; CHECK-NEXT: [[D:%.*]] = fsub float [[A:%.*]], 
[[C]] +; CHECK-NEXT: ret float [[D]] +; + %C = fsub ninf float %A, %B + %D = select i1 %cond, float %C, float %A + ret float %D +} + +define float @select_fpclass_fsub_ninf2(i1 %cond, float nofpclass(nan) %A, float %B) { +; CHECK-LABEL: @select_fpclass_fsub_ninf2( +; CHECK-NEXT: [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00 +; CHECK-NEXT: [[D:%.*]] = fsub float [[A:%.*]], [[C]] +; CHECK-NEXT: ret float [[D]] +; + %C = fsub float %A, %B + %D = select ninf i1 %cond, float %C, float %A + ret float %D +} + +define float @select_fpclass_fsub_ninf3(i1 %cond, float nofpclass(nan) %A, float %B) { +; CHECK-LABEL: @select_fpclass_fsub_ninf3( +; CHECK-NEXT: [[C:%.*]] = select ninf i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00 +; CHECK-NEXT: [[D:%.*]] = fsub ninf float [[A:%.*]], [[C]] +; CHECK-NEXT: ret float [[D]] +; + %C = fsub ninf float %A, %B + %D = select ninf i1 %cond, float %C, float %A + ret float %D +} + +define float @select_fpclass_fsub_nnan_ninf(i1 %cond, float nofpclass(nan) %A, float %B) { +; CHECK-LABEL: @select_fpclass_fsub_nnan_ninf( +; CHECK-NEXT: [[C:%.*]] = select nnan ninf i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00 +; CHECK-NEXT: [[D:%.*]] = fsub float [[A:%.*]], [[C]] +; CHECK-NEXT: ret float [[D]] +; + %C = fsub float %A, %B + %D = select nnan ninf i1 %cond, float %C, float %A + ret float %D +} + define <4 x float> @select_nnan_nsz_fsub_v4f32(<4 x i1> %cond, <4 x float> %A, <4 x float> %B) { ; CHECK-LABEL: @select_nnan_nsz_fsub_v4f32( ; CHECK-NEXT: [[C:%.*]] = select nnan nsz <4 x i1> [[COND:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer @@ -246,7 +378,7 @@ define float @select_nnan_fdiv_swapped(i1 %cond, float %A, float %B) { define float @select_nnan_fdiv_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fdiv_fast_math( -; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00 +; CHECK-NEXT: [[C:%.*]] = select nnan ninf i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00 ; CHECK-NEXT: [[D:%.*]] = fdiv reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; @@ -257,7 +389,7 @@ define float @select_nnan_fdiv_fast_math(i1 %cond, float %A, float %B) { define float @select_nnan_fdiv_swapped_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fdiv_swapped_fast_math( -; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]] +; CHECK-NEXT: [[C:%.*]] = select nnan ninf i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]] ; CHECK-NEXT: [[D:%.*]] = fdiv reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll index e154883c..9dbbf4c 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll @@ -45,7 +45,7 @@ define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 % ; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD7]] ; CHECK-NEXT: [[TMP8:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP7]]) ; CHECK-NEXT: [[TMP9:%.*]] = fdiv fast <4 x float> [[TMP8]], [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP20]], <4 x float> [[TMP9]], <4 x float> splat (float -0.000000e+00) +; CHECK-NEXT: [[TMP10:%.*]] = select ninf <4 x i1> [[TMP20]], <4 x float> [[TMP9]], <4 x float> splat (float 
-0.000000e+00) ; CHECK-NEXT: [[PREDPHI]] = fadd reassoc arcp contract afn <4 x float> [[VEC_PHI]], [[TMP10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll index e8709a5..55adda7 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll @@ -41,12 +41,12 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[TMP9:%.*]] = fcmp fast ogt <4 x double> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x double> [[TMP7]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x double> [[TMP6]], <4 x double> splat (double -0.000000e+00) -; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP9]], <4 x double> [[TMP7]], <4 x double> splat (double -0.000000e+00) +; CHECK-NEXT: [[TMP12:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP6]], <4 x double> splat (double -0.000000e+00) +; CHECK-NEXT: [[TMP13:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP7]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], [[TMP12]] ; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP13]] -; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP8]], <4 x double> [[TMP10]], <4 x double> splat (double -0.000000e+00) -; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP9]], <4 x double> [[TMP11]], <4 x double> splat (double -0.000000e+00) +; CHECK-NEXT: [[TMP16:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP10]], <4 x double> splat (double -0.000000e+00) +; CHECK-NEXT: [[TMP17:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP11]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP18]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI]], [[TMP16]] ; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI15]], [[TMP17]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 @@ -75,9 +75,9 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[SUB:%.*]] = fsub fast double [[MUL]], [[Z]] ; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast ogt double [[SUB]], 0.000000e+00 ; CHECK-NEXT: [[MUL3:%.*]] = fmul fast double [[SUB]], [[SUB]] -; CHECK-NEXT: [[ADD8:%.*]] = select i1 [[CMP1]], double [[SUB]], double -0.000000e+00 +; CHECK-NEXT: [[ADD8:%.*]] = select ninf i1 [[CMP1]], double [[SUB]], double -0.000000e+00 ; CHECK-NEXT: [[V0_2]] = fadd reassoc arcp contract afn double [[V0_011]], [[ADD8]] -; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP1]], double [[MUL3]], double -0.000000e+00 +; CHECK-NEXT: [[ADD4:%.*]] = select ninf i1 [[CMP1]], double [[MUL3]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2]] = fadd reassoc arcp contract afn double [[V1_012]], [[ADD4]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] @@ -229,12 +229,12 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast ogt <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x double> 
[[TMP10]], [[TMP10]] ; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x double> [[TMP11]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x double> [[TMP10]], <4 x double> splat (double -0.000000e+00) -; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP13]], <4 x double> [[TMP11]], <4 x double> splat (double -0.000000e+00) +; CHECK-NEXT: [[TMP16:%.*]] = select ninf <4 x i1> [[TMP12]], <4 x double> [[TMP10]], <4 x double> splat (double -0.000000e+00) +; CHECK-NEXT: [[TMP17:%.*]] = select ninf <4 x i1> [[TMP13]], <4 x double> [[TMP11]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP18]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI32]], [[TMP16]] ; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI33]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP12]], <4 x double> [[TMP14]], <4 x double> splat (double -0.000000e+00) -; CHECK-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP13]], <4 x double> [[TMP15]], <4 x double> splat (double -0.000000e+00) +; CHECK-NEXT: [[TMP20:%.*]] = select ninf <4 x i1> [[TMP12]], <4 x double> [[TMP14]], <4 x double> splat (double -0.000000e+00) +; CHECK-NEXT: [[TMP21:%.*]] = select ninf <4 x i1> [[TMP13]], <4 x double> [[TMP15]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP22]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI]], [[TMP20]] ; CHECK-NEXT: [[TMP23]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI31]], [[TMP21]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 @@ -263,9 +263,9 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[SUB_US:%.*]] = fsub fast double [[MUL_US]], [[Z]] ; CHECK-NEXT: [[CMP4_US:%.*]] = fcmp fast ogt double [[SUB_US]], 0.000000e+00 ; CHECK-NEXT: [[ADD7_US:%.*]] = fmul fast double [[SUB_US]], [[SUB_US]] -; CHECK-NEXT: [[ADD12_US:%.*]] = select i1 [[CMP4_US]], double [[SUB_US]], double -0.000000e+00 +; CHECK-NEXT: [[ADD12_US:%.*]] = select ninf i1 [[CMP4_US]], double [[SUB_US]], double -0.000000e+00 ; CHECK-NEXT: [[V0_2_US]] = fadd reassoc arcp contract afn double [[V0_115_US]], [[ADD12_US]] -; CHECK-NEXT: [[ADD7_US1:%.*]] = select i1 [[CMP4_US]], double [[ADD7_US]], double -0.000000e+00 +; CHECK-NEXT: [[ADD7_US1:%.*]] = select ninf i1 [[CMP4_US]], double [[ADD7_US]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2_US]] = fadd reassoc arcp contract afn double [[V1_116_US]], [[ADD7_US1]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND25_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll new file mode 100644 index 0000000..c1042f18 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll @@ -0,0 +1,526 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem \ +; RUN: -passes=slp-vectorizer -S < %s | FileCheck %s +; Function Attrs: nounwind uwtable vscale_range(8,1024) +define i32 @x264_pixel_satd_8x4(ptr %pix1, i32 %i_pix1, ptr %pix2, i32 %i_pix2) { +; CHECK-LABEL: define i32 @x264_pixel_satd_8x4( +; CHECK-SAME: ptr [[PIX1:%.*]], i32 [[I_PIX1:%.*]], ptr [[PIX2:%.*]], i32 [[I_PIX2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[I_PIX1]] to i64 +; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[I_PIX2]] to i64 
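A recurring detail in the select-binop and predicated-reduction diffs above is the constant on the untaken select arm: -0.0 for fadd, 1.0 for fmul, 0.0 for fsub. -0.0 (not +0.0) is the fadd identity because x + -0.0 == x for every x, including x == -0.0, whereas -0.0 + +0.0 rounds to +0.0 and would flush the sign of an all-masked accumulator. A quick check in C:

```c
#include <stdio.h>

int main(void) {
  double acc = -0.0;
  printf("%g\n", acc + -0.0); /* -0: -0.0 leaves the sum unchanged */
  printf("%g\n", acc + 0.0);  /* 0: +0.0 would clobber the sign    */
  return 0;
}
```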
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX1]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX2]], i64 4 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PIX1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[PIX2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64]], i64 4 +; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_1]], i64 4 +; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_2]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_2]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP18:%.*]] 
= shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19> +; CHECK-NEXT: [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19> +; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19> +; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP40]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19> +; CHECK-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], splat (i32 16) +; CHECK-NEXT: [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]] +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +; CHECK-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[TMP52]], [[TMP51]] +; CHECK-NEXT: [[TMP54:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP51]] +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, 
i32 31> +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13> +; CHECK-NEXT: [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 20, i32 21, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 29, i32 14, i32 15> +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP68:%.*]] = lshr <16 x i32> [[TMP67]], splat (i32 15) +; CHECK-NEXT: [[TMP69:%.*]] = and <16 x i32> [[TMP68]], splat (i32 65537) +; CHECK-NEXT: [[TMP70:%.*]] = mul nuw <16 x i32> [[TMP69]], splat (i32 65535) +; CHECK-NEXT: [[TMP71:%.*]] = add <16 x i32> [[TMP70]], [[TMP67]] +; CHECK-NEXT: [[TMP72:%.*]] = xor <16 x i32> [[TMP71]], [[TMP70]] +; CHECK-NEXT: [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP72]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP73]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP73]], 16 +; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] +; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 +; CHECK-NEXT: ret i32 [[SHR120]] +; +entry: + %idx.ext = sext i32 %i_pix1 to i64 + %idx.ext63 = sext i32 %i_pix2 to i64 + %0 = load i8, ptr %pix1, align 1 + %conv = zext i8 %0 to i32 + %1 = load i8, ptr %pix2, align 1 + %conv2 = zext i8 %1 to i32 + %sub = sub nsw i32 %conv, %conv2 + %arrayidx3 = getelementptr inbounds nuw i8, ptr %pix1, i64 4 + %2 = load i8, ptr %arrayidx3, align 1 + %conv4 = zext i8 %2 to i32 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %pix2, i64 4 + %3 = load i8, ptr %arrayidx5, align 1 + %conv6 = zext i8 %3 to i32 + %sub7 = sub nsw i32 %conv4, %conv6 + %shl = shl nsw i32 %sub7, 16 + %add = add nsw i32 %shl, %sub + %arrayidx8 = getelementptr inbounds nuw i8, ptr %pix1, i64 1 + %4 = load i8, ptr %arrayidx8, align 1 + %conv9 = zext i8 %4 to i32 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %pix2, i64 1 + %5 = load i8, ptr %arrayidx10, align 1 + %conv11 = zext i8 %5 to i32 + %sub12 = sub nsw i32 %conv9, %conv11 + %arrayidx13 = getelementptr inbounds nuw i8, ptr %pix1, i64 5 + %6 = load i8, ptr %arrayidx13, align 1 + %conv14 = zext i8 %6 to i32 + %arrayidx15 = 
getelementptr inbounds nuw i8, ptr %pix2, i64 5 + %7 = load i8, ptr %arrayidx15, align 1 + %conv16 = zext i8 %7 to i32 + %sub17 = sub nsw i32 %conv14, %conv16 + %shl18 = shl nsw i32 %sub17, 16 + %add19 = add nsw i32 %shl18, %sub12 + %arrayidx20 = getelementptr inbounds nuw i8, ptr %pix1, i64 2 + %8 = load i8, ptr %arrayidx20, align 1 + %conv21 = zext i8 %8 to i32 + %arrayidx22 = getelementptr inbounds nuw i8, ptr %pix2, i64 2 + %9 = load i8, ptr %arrayidx22, align 1 + %conv23 = zext i8 %9 to i32 + %sub24 = sub nsw i32 %conv21, %conv23 + %arrayidx25 = getelementptr inbounds nuw i8, ptr %pix1, i64 6 + %10 = load i8, ptr %arrayidx25, align 1 + %conv26 = zext i8 %10 to i32 + %arrayidx27 = getelementptr inbounds nuw i8, ptr %pix2, i64 6 + %11 = load i8, ptr %arrayidx27, align 1 + %conv28 = zext i8 %11 to i32 + %sub29 = sub nsw i32 %conv26, %conv28 + %shl30 = shl nsw i32 %sub29, 16 + %add31 = add nsw i32 %shl30, %sub24 + %arrayidx32 = getelementptr inbounds nuw i8, ptr %pix1, i64 3 + %12 = load i8, ptr %arrayidx32, align 1 + %conv33 = zext i8 %12 to i32 + %arrayidx34 = getelementptr inbounds nuw i8, ptr %pix2, i64 3 + %13 = load i8, ptr %arrayidx34, align 1 + %conv35 = zext i8 %13 to i32 + %sub36 = sub nsw i32 %conv33, %conv35 + %arrayidx37 = getelementptr inbounds nuw i8, ptr %pix1, i64 7 + %14 = load i8, ptr %arrayidx37, align 1 + %conv38 = zext i8 %14 to i32 + %arrayidx39 = getelementptr inbounds nuw i8, ptr %pix2, i64 7 + %15 = load i8, ptr %arrayidx39, align 1 + %conv40 = zext i8 %15 to i32 + %sub41 = sub nsw i32 %conv38, %conv40 + %shl42 = shl nsw i32 %sub41, 16 + %add43 = add nsw i32 %shl42, %sub36 + %add44 = add nsw i32 %add19, %add + %sub45 = sub nsw i32 %add, %add19 + %add46 = add nsw i32 %add43, %add31 + %sub47 = sub nsw i32 %add31, %add43 + %add48 = add nsw i32 %add46, %add44 + %sub51 = sub nsw i32 %add44, %add46 + %add55 = add nsw i32 %sub47, %sub45 + %sub59 = sub nsw i32 %sub45, %sub47 + %add.ptr = getelementptr inbounds i8, ptr %pix1, i64 %idx.ext + %add.ptr64 = getelementptr inbounds i8, ptr %pix2, i64 %idx.ext63 + %16 = load i8, ptr %add.ptr, align 1 + %conv.1 = zext i8 %16 to i32 + %17 = load i8, ptr %add.ptr64, align 1 + %conv2.1 = zext i8 %17 to i32 + %sub.1 = sub nsw i32 %conv.1, %conv2.1 + %arrayidx3.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 4 + %18 = load i8, ptr %arrayidx3.1, align 1 + %conv4.1 = zext i8 %18 to i32 + %arrayidx5.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 4 + %19 = load i8, ptr %arrayidx5.1, align 1 + %conv6.1 = zext i8 %19 to i32 + %sub7.1 = sub nsw i32 %conv4.1, %conv6.1 + %shl.1 = shl nsw i32 %sub7.1, 16 + %add.1 = add nsw i32 %shl.1, %sub.1 + %arrayidx8.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 1 + %20 = load i8, ptr %arrayidx8.1, align 1 + %conv9.1 = zext i8 %20 to i32 + %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 1 + %21 = load i8, ptr %arrayidx10.1, align 1 + %conv11.1 = zext i8 %21 to i32 + %sub12.1 = sub nsw i32 %conv9.1, %conv11.1 + %arrayidx13.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 5 + %22 = load i8, ptr %arrayidx13.1, align 1 + %conv14.1 = zext i8 %22 to i32 + %arrayidx15.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 5 + %23 = load i8, ptr %arrayidx15.1, align 1 + %conv16.1 = zext i8 %23 to i32 + %sub17.1 = sub nsw i32 %conv14.1, %conv16.1 + %shl18.1 = shl nsw i32 %sub17.1, 16 + %add19.1 = add nsw i32 %shl18.1, %sub12.1 + %arrayidx20.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 2 + %24 = load i8, ptr %arrayidx20.1, align 1 + %conv21.1 = zext i8 %24 to i32 + 
%arrayidx22.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 2 + %25 = load i8, ptr %arrayidx22.1, align 1 + %conv23.1 = zext i8 %25 to i32 + %sub24.1 = sub nsw i32 %conv21.1, %conv23.1 + %arrayidx25.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 6 + %26 = load i8, ptr %arrayidx25.1, align 1 + %conv26.1 = zext i8 %26 to i32 + %arrayidx27.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 6 + %27 = load i8, ptr %arrayidx27.1, align 1 + %conv28.1 = zext i8 %27 to i32 + %sub29.1 = sub nsw i32 %conv26.1, %conv28.1 + %shl30.1 = shl nsw i32 %sub29.1, 16 + %add31.1 = add nsw i32 %shl30.1, %sub24.1 + %arrayidx32.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 3 + %28 = load i8, ptr %arrayidx32.1, align 1 + %conv33.1 = zext i8 %28 to i32 + %arrayidx34.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 3 + %29 = load i8, ptr %arrayidx34.1, align 1 + %conv35.1 = zext i8 %29 to i32 + %sub36.1 = sub nsw i32 %conv33.1, %conv35.1 + %arrayidx37.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 7 + %30 = load i8, ptr %arrayidx37.1, align 1 + %conv38.1 = zext i8 %30 to i32 + %arrayidx39.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 7 + %31 = load i8, ptr %arrayidx39.1, align 1 + %conv40.1 = zext i8 %31 to i32 + %sub41.1 = sub nsw i32 %conv38.1, %conv40.1 + %shl42.1 = shl nsw i32 %sub41.1, 16 + %add43.1 = add nsw i32 %shl42.1, %sub36.1 + %add44.1 = add nsw i32 %add19.1, %add.1 + %sub45.1 = sub nsw i32 %add.1, %add19.1 + %add46.1 = add nsw i32 %add43.1, %add31.1 + %sub47.1 = sub nsw i32 %add31.1, %add43.1 + %add48.1 = add nsw i32 %add46.1, %add44.1 + %sub51.1 = sub nsw i32 %add44.1, %add46.1 + %add55.1 = add nsw i32 %sub47.1, %sub45.1 + %sub59.1 = sub nsw i32 %sub45.1, %sub47.1 + %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext + %add.ptr64.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 %idx.ext63 + %32 = load i8, ptr %add.ptr.1, align 1 + %conv.2 = zext i8 %32 to i32 + %33 = load i8, ptr %add.ptr64.1, align 1 + %conv2.2 = zext i8 %33 to i32 + %sub.2 = sub nsw i32 %conv.2, %conv2.2 + %arrayidx3.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 4 + %34 = load i8, ptr %arrayidx3.2, align 1 + %conv4.2 = zext i8 %34 to i32 + %arrayidx5.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 4 + %35 = load i8, ptr %arrayidx5.2, align 1 + %conv6.2 = zext i8 %35 to i32 + %sub7.2 = sub nsw i32 %conv4.2, %conv6.2 + %shl.2 = shl nsw i32 %sub7.2, 16 + %add.2 = add nsw i32 %shl.2, %sub.2 + %arrayidx8.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 1 + %36 = load i8, ptr %arrayidx8.2, align 1 + %conv9.2 = zext i8 %36 to i32 + %arrayidx10.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 1 + %37 = load i8, ptr %arrayidx10.2, align 1 + %conv11.2 = zext i8 %37 to i32 + %sub12.2 = sub nsw i32 %conv9.2, %conv11.2 + %arrayidx13.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 5 + %38 = load i8, ptr %arrayidx13.2, align 1 + %conv14.2 = zext i8 %38 to i32 + %arrayidx15.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 5 + %39 = load i8, ptr %arrayidx15.2, align 1 + %conv16.2 = zext i8 %39 to i32 + %sub17.2 = sub nsw i32 %conv14.2, %conv16.2 + %shl18.2 = shl nsw i32 %sub17.2, 16 + %add19.2 = add nsw i32 %shl18.2, %sub12.2 + %arrayidx20.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 2 + %40 = load i8, ptr %arrayidx20.2, align 1 + %conv21.2 = zext i8 %40 to i32 + %arrayidx22.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 2 + %41 = load i8, ptr %arrayidx22.2, align 1 + %conv23.2 = zext i8 %41 to i32 + %sub24.2 = sub nsw i32 
%conv21.2, %conv23.2 + %arrayidx25.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 6 + %42 = load i8, ptr %arrayidx25.2, align 1 + %conv26.2 = zext i8 %42 to i32 + %arrayidx27.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 6 + %43 = load i8, ptr %arrayidx27.2, align 1 + %conv28.2 = zext i8 %43 to i32 + %sub29.2 = sub nsw i32 %conv26.2, %conv28.2 + %shl30.2 = shl nsw i32 %sub29.2, 16 + %add31.2 = add nsw i32 %shl30.2, %sub24.2 + %arrayidx32.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 3 + %44 = load i8, ptr %arrayidx32.2, align 1 + %conv33.2 = zext i8 %44 to i32 + %arrayidx34.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 3 + %45 = load i8, ptr %arrayidx34.2, align 1 + %conv35.2 = zext i8 %45 to i32 + %sub36.2 = sub nsw i32 %conv33.2, %conv35.2 + %arrayidx37.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 7 + %46 = load i8, ptr %arrayidx37.2, align 1 + %conv38.2 = zext i8 %46 to i32 + %arrayidx39.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 7 + %47 = load i8, ptr %arrayidx39.2, align 1 + %conv40.2 = zext i8 %47 to i32 + %sub41.2 = sub nsw i32 %conv38.2, %conv40.2 + %shl42.2 = shl nsw i32 %sub41.2, 16 + %add43.2 = add nsw i32 %shl42.2, %sub36.2 + %add44.2 = add nsw i32 %add19.2, %add.2 + %sub45.2 = sub nsw i32 %add.2, %add19.2 + %add46.2 = add nsw i32 %add43.2, %add31.2 + %sub47.2 = sub nsw i32 %add31.2, %add43.2 + %add48.2 = add nsw i32 %add46.2, %add44.2 + %sub51.2 = sub nsw i32 %add44.2, %add46.2 + %add55.2 = add nsw i32 %sub47.2, %sub45.2 + %sub59.2 = sub nsw i32 %sub45.2, %sub47.2 + %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext + %add.ptr64.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 %idx.ext63 + %48 = load i8, ptr %add.ptr.2, align 1 + %conv.3 = zext i8 %48 to i32 + %49 = load i8, ptr %add.ptr64.2, align 1 + %conv2.3 = zext i8 %49 to i32 + %sub.3 = sub nsw i32 %conv.3, %conv2.3 + %arrayidx3.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 4 + %50 = load i8, ptr %arrayidx3.3, align 1 + %conv4.3 = zext i8 %50 to i32 + %arrayidx5.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 4 + %51 = load i8, ptr %arrayidx5.3, align 1 + %conv6.3 = zext i8 %51 to i32 + %sub7.3 = sub nsw i32 %conv4.3, %conv6.3 + %shl.3 = shl nsw i32 %sub7.3, 16 + %add.3 = add nsw i32 %shl.3, %sub.3 + %arrayidx8.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 1 + %52 = load i8, ptr %arrayidx8.3, align 1 + %conv9.3 = zext i8 %52 to i32 + %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 1 + %53 = load i8, ptr %arrayidx10.3, align 1 + %conv11.3 = zext i8 %53 to i32 + %sub12.3 = sub nsw i32 %conv9.3, %conv11.3 + %arrayidx13.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 5 + %54 = load i8, ptr %arrayidx13.3, align 1 + %conv14.3 = zext i8 %54 to i32 + %arrayidx15.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 5 + %55 = load i8, ptr %arrayidx15.3, align 1 + %conv16.3 = zext i8 %55 to i32 + %sub17.3 = sub nsw i32 %conv14.3, %conv16.3 + %shl18.3 = shl nsw i32 %sub17.3, 16 + %add19.3 = add nsw i32 %shl18.3, %sub12.3 + %arrayidx20.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 2 + %56 = load i8, ptr %arrayidx20.3, align 1 + %conv21.3 = zext i8 %56 to i32 + %arrayidx22.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 2 + %57 = load i8, ptr %arrayidx22.3, align 1 + %conv23.3 = zext i8 %57 to i32 + %sub24.3 = sub nsw i32 %conv21.3, %conv23.3 + %arrayidx25.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 6 + %58 = load i8, ptr %arrayidx25.3, align 1 + %conv26.3 = zext 
i8 %58 to i32 + %arrayidx27.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 6 + %59 = load i8, ptr %arrayidx27.3, align 1 + %conv28.3 = zext i8 %59 to i32 + %sub29.3 = sub nsw i32 %conv26.3, %conv28.3 + %shl30.3 = shl nsw i32 %sub29.3, 16 + %add31.3 = add nsw i32 %shl30.3, %sub24.3 + %arrayidx32.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 3 + %60 = load i8, ptr %arrayidx32.3, align 1 + %conv33.3 = zext i8 %60 to i32 + %arrayidx34.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 3 + %61 = load i8, ptr %arrayidx34.3, align 1 + %conv35.3 = zext i8 %61 to i32 + %sub36.3 = sub nsw i32 %conv33.3, %conv35.3 + %arrayidx37.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 7 + %62 = load i8, ptr %arrayidx37.3, align 1 + %conv38.3 = zext i8 %62 to i32 + %arrayidx39.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 7 + %63 = load i8, ptr %arrayidx39.3, align 1 + %conv40.3 = zext i8 %63 to i32 + %sub41.3 = sub nsw i32 %conv38.3, %conv40.3 + %shl42.3 = shl nsw i32 %sub41.3, 16 + %add43.3 = add nsw i32 %shl42.3, %sub36.3 + %add44.3 = add nsw i32 %add19.3, %add.3 + %sub45.3 = sub nsw i32 %add.3, %add19.3 + %add46.3 = add nsw i32 %add43.3, %add31.3 + %sub47.3 = sub nsw i32 %add31.3, %add43.3 + %add48.3 = add nsw i32 %add46.3, %add44.3 + %sub51.3 = sub nsw i32 %add44.3, %add46.3 + %add55.3 = add nsw i32 %sub47.3, %sub45.3 + %sub59.3 = sub nsw i32 %sub45.3, %sub47.3 + %add78 = add nsw i32 %add48.1, %add48 + %sub86 = sub nsw i32 %add48, %add48.1 + %add94 = add nsw i32 %add48.3, %add48.2 + %sub102 = sub nsw i32 %add48.2, %add48.3 + %add103 = add nsw i32 %add94, %add78 + %sub104 = sub nsw i32 %add78, %add94 + %add105 = add nsw i32 %sub102, %sub86 + %sub106 = sub nsw i32 %sub86, %sub102 + %shr.i = lshr i32 %add103, 15 + %and.i = and i32 %shr.i, 65537 + %mul.i = mul nuw i32 %and.i, 65535 + %add.i = add i32 %mul.i, %add103 + %xor.i = xor i32 %add.i, %mul.i + %shr.i169 = lshr i32 %add105, 15 + %and.i170 = and i32 %shr.i169, 65537 + %mul.i171 = mul nuw i32 %and.i170, 65535 + %add.i172 = add i32 %mul.i171, %add105 + %xor.i173 = xor i32 %add.i172, %mul.i171 + %shr.i174 = lshr i32 %sub104, 15 + %and.i175 = and i32 %shr.i174, 65537 + %mul.i176 = mul nuw i32 %and.i175, 65535 + %add.i177 = add i32 %mul.i176, %sub104 + %xor.i178 = xor i32 %add.i177, %mul.i176 + %shr.i179 = lshr i32 %sub106, 15 + %and.i180 = and i32 %shr.i179, 65537 + %mul.i181 = mul nuw i32 %and.i180, 65535 + %add.i182 = add i32 %mul.i181, %sub106 + %xor.i183 = xor i32 %add.i182, %mul.i181 + %add110 = add i32 %xor.i173, %xor.i + %add112 = add i32 %add110, %xor.i178 + %add113 = add i32 %add112, %xor.i183 + %add78.1 = add nsw i32 %add55.1, %add55 + %sub86.1 = sub nsw i32 %add55, %add55.1 + %add94.1 = add nsw i32 %add55.3, %add55.2 + %sub102.1 = sub nsw i32 %add55.2, %add55.3 + %add103.1 = add nsw i32 %add94.1, %add78.1 + %sub104.1 = sub nsw i32 %add78.1, %add94.1 + %add105.1 = add nsw i32 %sub102.1, %sub86.1 + %sub106.1 = sub nsw i32 %sub86.1, %sub102.1 + %shr.i.1 = lshr i32 %add103.1, 15 + %and.i.1 = and i32 %shr.i.1, 65537 + %mul.i.1 = mul nuw i32 %and.i.1, 65535 + %add.i.1 = add i32 %mul.i.1, %add103.1 + %xor.i.1 = xor i32 %add.i.1, %mul.i.1 + %shr.i169.1 = lshr i32 %add105.1, 15 + %and.i170.1 = and i32 %shr.i169.1, 65537 + %mul.i171.1 = mul nuw i32 %and.i170.1, 65535 + %add.i172.1 = add i32 %mul.i171.1, %add105.1 + %xor.i173.1 = xor i32 %add.i172.1, %mul.i171.1 + %shr.i174.1 = lshr i32 %sub104.1, 15 + %and.i175.1 = and i32 %shr.i174.1, 65537 + %mul.i176.1 = mul nuw i32 %and.i175.1, 65535 + %add.i177.1 = add i32 
%mul.i176.1, %sub104.1 + %xor.i178.1 = xor i32 %add.i177.1, %mul.i176.1 + %shr.i179.1 = lshr i32 %sub106.1, 15 + %and.i180.1 = and i32 %shr.i179.1, 65537 + %mul.i181.1 = mul nuw i32 %and.i180.1, 65535 + %add.i182.1 = add i32 %mul.i181.1, %sub106.1 + %xor.i183.1 = xor i32 %add.i182.1, %mul.i181.1 + %add108.1 = add i32 %xor.i173.1, %add113 + %add110.1 = add i32 %add108.1, %xor.i.1 + %add112.1 = add i32 %add110.1, %xor.i178.1 + %add113.1 = add i32 %add112.1, %xor.i183.1 + %add78.2 = add nsw i32 %sub51.1, %sub51 + %sub86.2 = sub nsw i32 %sub51, %sub51.1 + %add94.2 = add nsw i32 %sub51.3, %sub51.2 + %sub102.2 = sub nsw i32 %sub51.2, %sub51.3 + %add103.2 = add nsw i32 %add94.2, %add78.2 + %sub104.2 = sub nsw i32 %add78.2, %add94.2 + %add105.2 = add nsw i32 %sub102.2, %sub86.2 + %sub106.2 = sub nsw i32 %sub86.2, %sub102.2 + %shr.i.2 = lshr i32 %add103.2, 15 + %and.i.2 = and i32 %shr.i.2, 65537 + %mul.i.2 = mul nuw i32 %and.i.2, 65535 + %add.i.2 = add i32 %mul.i.2, %add103.2 + %xor.i.2 = xor i32 %add.i.2, %mul.i.2 + %shr.i169.2 = lshr i32 %add105.2, 15 + %and.i170.2 = and i32 %shr.i169.2, 65537 + %mul.i171.2 = mul nuw i32 %and.i170.2, 65535 + %add.i172.2 = add i32 %mul.i171.2, %add105.2 + %xor.i173.2 = xor i32 %add.i172.2, %mul.i171.2 + %shr.i174.2 = lshr i32 %sub104.2, 15 + %and.i175.2 = and i32 %shr.i174.2, 65537 + %mul.i176.2 = mul nuw i32 %and.i175.2, 65535 + %add.i177.2 = add i32 %mul.i176.2, %sub104.2 + %xor.i178.2 = xor i32 %add.i177.2, %mul.i176.2 + %shr.i179.2 = lshr i32 %sub106.2, 15 + %and.i180.2 = and i32 %shr.i179.2, 65537 + %mul.i181.2 = mul nuw i32 %and.i180.2, 65535 + %add.i182.2 = add i32 %mul.i181.2, %sub106.2 + %xor.i183.2 = xor i32 %add.i182.2, %mul.i181.2 + %add108.2 = add i32 %xor.i173.2, %add113.1 + %add110.2 = add i32 %add108.2, %xor.i.2 + %add112.2 = add i32 %add110.2, %xor.i178.2 + %add113.2 = add i32 %add112.2, %xor.i183.2 + %add78.3 = add nsw i32 %sub59.1, %sub59 + %sub86.3 = sub nsw i32 %sub59, %sub59.1 + %add94.3 = add nsw i32 %sub59.3, %sub59.2 + %sub102.3 = sub nsw i32 %sub59.2, %sub59.3 + %add103.3 = add nsw i32 %add94.3, %add78.3 + %sub104.3 = sub nsw i32 %add78.3, %add94.3 + %add105.3 = add nsw i32 %sub102.3, %sub86.3 + %sub106.3 = sub nsw i32 %sub86.3, %sub102.3 + %shr.i.3 = lshr i32 %add103.3, 15 + %and.i.3 = and i32 %shr.i.3, 65537 + %mul.i.3 = mul nuw i32 %and.i.3, 65535 + %add.i.3 = add i32 %mul.i.3, %add103.3 + %xor.i.3 = xor i32 %add.i.3, %mul.i.3 + %shr.i169.3 = lshr i32 %add105.3, 15 + %and.i170.3 = and i32 %shr.i169.3, 65537 + %mul.i171.3 = mul nuw i32 %and.i170.3, 65535 + %add.i172.3 = add i32 %mul.i171.3, %add105.3 + %xor.i173.3 = xor i32 %add.i172.3, %mul.i171.3 + %shr.i174.3 = lshr i32 %sub104.3, 15 + %and.i175.3 = and i32 %shr.i174.3, 65537 + %mul.i176.3 = mul nuw i32 %and.i175.3, 65535 + %add.i177.3 = add i32 %mul.i176.3, %sub104.3 + %xor.i178.3 = xor i32 %add.i177.3, %mul.i176.3 + %shr.i179.3 = lshr i32 %sub106.3, 15 + %and.i180.3 = and i32 %shr.i179.3, 65537 + %mul.i181.3 = mul nuw i32 %and.i180.3, 65535 + %add.i182.3 = add i32 %mul.i181.3, %sub106.3 + %xor.i183.3 = xor i32 %add.i182.3, %mul.i181.3 + %add108.3 = add i32 %xor.i173.3, %add113.2 + %add110.3 = add i32 %add108.3, %xor.i.3 + %add112.3 = add i32 %add110.3, %xor.i178.3 + %add113.3 = add i32 %add112.3, %xor.i183.3 + %conv118 = and i32 %add113.3, 65535 + %shr = lshr i32 %add113.3, 16 + %add119 = add nuw nsw i32 %conv118, %shr + %shr120 = lshr i32 %add119, 1 + ret i32 %shr120 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/poison-within-divisions.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/poison-within-divisions.ll new file mode 100644 index 0000000..76ef396 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/poison-within-divisions.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt --passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @test(i1 %tobool2.not, i64 %conv21) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i1 [[TOBOOL2_NOT:%.*]], i64 [[CONV21:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[Q24_659:%.*]] = phi i32 [ [[Q24_655:%.*]], %[[IF_END35:.*]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[L15_1:%.*]] = phi i32 [ [[L15_4:%.*]], %[[IF_END35]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br i1 [[TOBOOL2_NOT]], label %[[IF_END4:.*]], label %[[Q:.*]] +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[Q24_659]], 0 +; CHECK-NEXT: br label %[[AB:.*]] +; CHECK: [[AB]]: +; CHECK-NEXT: [[Q24_658:%.*]] = phi i32 [ [[Q24_660:%.*]], %[[IF_END35]] ], [ 0, %[[IF_END4]] ] +; CHECK-NEXT: [[M_1:%.*]] = phi i1 [ false, %[[IF_END35]] ], [ [[TMP0]], %[[IF_END4]] ] +; CHECK-NEXT: [[O_2:%.*]] = phi i32 [ [[O_7:%.*]], %[[IF_END35]] ], [ 0, %[[IF_END4]] ] +; CHECK-NEXT: [[Q24_2:%.*]] = phi i32 [ [[Q24_7:%.*]], %[[IF_END35]] ], [ 0, %[[IF_END4]] ] +; CHECK-NEXT: br i1 [[M_1]], label %[[AE:.*]], label %[[AC:.*]] +; CHECK: [[Q]]: +; CHECK-NEXT: [[TOBOOL16_NOT:%.*]] = icmp ne i32 [[L15_1]], 0 +; CHECK-NEXT: [[SPEC_SELECT2:%.*]] = zext i1 [[TOBOOL16_NOT]] to i32 +; CHECK-NEXT: br label %[[AE]] +; CHECK: [[AE]]: +; CHECK-NEXT: [[Q24_655]] = phi i32 [ [[Q24_658]], %[[AB]] ], [ 0, %[[Q]] ] +; CHECK-NEXT: [[M_3:%.*]] = phi i64 [ 0, %[[AB]] ], [ 1, %[[Q]] ] +; CHECK-NEXT: [[L15_4]] = phi i32 [ poison, %[[AB]] ], [ [[SPEC_SELECT2]], %[[Q]] ] +; CHECK-NEXT: [[O_4:%.*]] = phi i32 [ [[O_2]], %[[AB]] ], [ 0, %[[Q]] ] +; CHECK-NEXT: [[Q24_4:%.*]] = phi i32 [ [[Q24_2]], %[[AB]] ], [ 0, %[[Q]] ] +; CHECK-NEXT: br i1 [[TOBOOL2_NOT]], label %[[IF_END35]], label %[[IF_THEN20:.*]] +; CHECK: [[IF_THEN20]]: +; CHECK-NEXT: [[DIV22:%.*]] = udiv i64 [[M_3]], [[CONV21]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[DIV22]] to i32 +; CHECK-NEXT: [[CONV23:%.*]] = sub i32 0, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[M_3]] to i32 +; CHECK-NEXT: [[CONV25:%.*]] = xor i32 [[TMP2]], 1 +; CHECK-NEXT: br label %[[IF_END35]] +; CHECK: [[AC]]: +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TOBOOL2_NOT]], i32 [[Q24_2]], i32 [[O_2]] +; CHECK-NEXT: ret i32 [[SPEC_SELECT]] +; CHECK: [[IF_END35]]: +; CHECK-NEXT: [[Q24_660]] = phi i32 [ 0, %[[AE]] ], [ [[CONV25]], %[[IF_THEN20]] ] +; CHECK-NEXT: [[O_7]] = phi i32 [ [[O_4]], %[[AE]] ], [ [[CONV23]], %[[IF_THEN20]] ] +; CHECK-NEXT: [[Q24_7]] = phi i32 [ [[Q24_4]], %[[AE]] ], [ [[CONV25]], %[[IF_THEN20]] ] +; CHECK-NEXT: br i1 [[TOBOOL2_NOT]], label %[[WHILE_BODY]], label %[[AB]] +; +entry: + br label %while.body + +while.body: + %q24.659 = phi i32 [ %q24.655, %if.end35 ], [ 0, %entry ] + %l15.1 = phi i32 [ %l15.4, %if.end35 ], [ 0, %entry ] + br i1 %tobool2.not, label %if.end4, label %q + +if.end4: + %0 = icmp eq i32 %q24.659, 0 + br label %ab + +ab: + %q24.658 = phi i32 [ %q24.660, %if.end35 ], [ 0, %if.end4 ] + %m.1 = phi i1 [ false, %if.end35 ], [ %0, %if.end4 ] + %o.2 = phi i32 [ %o.7, %if.end35 ], [ 0, %if.end4 ] + %q24.2 = phi i32 [ %q24.7, %if.end35 ], [ 0, %if.end4 ] + br i1 %m.1, label %ae, label %ac + +q: + %tobool16.not = 
icmp ne i32 %l15.1, 0 + %spec.select2 = zext i1 %tobool16.not to i32 + br label %ae + +ae: + %q24.655 = phi i32 [ %q24.658, %ab ], [ 0, %q ] + %m.3 = phi i64 [ 0, %ab ], [ 1, %q ] + %l15.4 = phi i32 [ poison, %ab ], [ %spec.select2, %q ] + %o.4 = phi i32 [ %o.2, %ab ], [ 0, %q ] + %q24.4 = phi i32 [ %q24.2, %ab ], [ 0, %q ] + br i1 %tobool2.not, label %if.end35, label %if.then20 + +if.then20: + %div22 = udiv i64 %m.3, %conv21 + %1 = trunc i64 %div22 to i32 + %conv23 = sub i32 0, %1 + %2 = trunc i64 %m.3 to i32 + %conv25 = xor i32 %2, 1 + br label %if.end35 + +ac: + %spec.select = select i1 %tobool2.not, i32 %q24.2, i32 %o.2 + ret i32 %spec.select + +if.end35: + %q24.660 = phi i32 [ 0, %ae ], [ %conv25, %if.then20 ] + %o.7 = phi i32 [ %o.4, %ae ], [ %conv23, %if.then20 ] + %q24.7 = phi i32 [ %q24.4, %ae ], [ %conv25, %if.then20 ] + br i1 %tobool2.not, label %while.body, label %ab +} diff --git a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll index c858d07..ead6e02 100644 --- a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll +++ b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll @@ -219,4 +219,18 @@ entry: } +define <1 x i32> @test_store_value_size_not_multiple_of_allocated_element_type_size(<1 x i16> %a, <1 x i16> %b) { +entry: + %alloca = alloca [2 x i16] + + %ptr0 = getelementptr inbounds [2 x i16], ptr %alloca, i32 0, i32 0 + store <1 x i16> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds [2 x i16], ptr %alloca, i32 0, i32 1 + store <1 x i16> %b, ptr %ptr1 + + %result = load <1 x i32>, ptr %alloca + ret <1 x i32> %result +} + declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) diff --git a/llvm/tools/bugpoint/OptimizerDriver.cpp b/llvm/tools/bugpoint/OptimizerDriver.cpp index 56a0fa4..3daacfd 100644 --- a/llvm/tools/bugpoint/OptimizerDriver.cpp +++ b/llvm/tools/bugpoint/OptimizerDriver.cpp @@ -38,11 +38,6 @@ namespace llvm { extern cl::opt<std::string> OutputPrefix; } -static cl::opt<bool> PreserveBitcodeUseListOrder( - "preserve-bc-uselistorder", - cl::desc("Preserve use-list order when writing LLVM bitcode."), - cl::init(true), cl::Hidden); - static cl::opt<std::string> OptCmd("opt-command", cl::init(""), cl::desc("Path to opt. (default: search path " @@ -51,7 +46,7 @@ static cl::opt<std::string> /// This writes the current "Program" to the named bitcode file. If an error /// occurs, true is returned. 
 static bool writeProgramToFileAux(ToolOutputFile &Out, const Module &M) {
-  WriteBitcodeToFile(M, Out.os(), PreserveBitcodeUseListOrder);
+  WriteBitcodeToFile(M, Out.os(), /* ShouldPreserveUseListOrder */ true);
   Out.os().close();
   if (!Out.os().has_error()) {
     Out.keep();
@@ -68,7 +63,7 @@ bool BugDriver::writeProgramToFile(const std::string &Filename, int FD,
 
 bool BugDriver::writeProgramToFile(int FD, const Module &M) const {
   raw_fd_ostream OS(FD, /*shouldClose*/ false);
-  WriteBitcodeToFile(M, OS, PreserveBitcodeUseListOrder);
+  WriteBitcodeToFile(M, OS, /* ShouldPreserveUseListOrder */ true);
   OS.flush();
   if (!OS.has_error())
     return false;
@@ -155,7 +150,7 @@ bool BugDriver::runPasses(Module &Program,
   DiscardTemp Discard{*Temp};
   raw_fd_ostream OS(Temp->FD, /*shouldClose*/ false);
-  WriteBitcodeToFile(Program, OS, PreserveBitcodeUseListOrder);
+  WriteBitcodeToFile(Program, OS, /* ShouldPreserveUseListOrder */ true);
   OS.flush();
   if (OS.has_error()) {
     errs() << "Error writing bitcode file: " << Temp->TmpName << "\n";
diff --git a/llvm/tools/llvm-as/llvm-as.cpp b/llvm/tools/llvm-as/llvm-as.cpp
index 2164867..200e6a5 100644
--- a/llvm/tools/llvm-as/llvm-as.cpp
+++ b/llvm/tools/llvm-as/llvm-as.cpp
@@ -57,11 +57,6 @@ static cl::opt<bool>
                cl::desc("Do not run verifier on input LLVM (dangerous!)"),
                cl::cat(AsCat));
 
-static cl::opt<bool> PreserveBitcodeUseListOrder(
-    "preserve-bc-uselistorder",
-    cl::desc("Preserve use-list order when writing LLVM bitcode."),
-    cl::init(true), cl::Hidden, cl::cat(AsCat));
-
 static cl::opt<std::string> ClDataLayout("data-layout",
                                          cl::desc("data layout string to use"),
                                          cl::value_desc("layout-string"),
@@ -100,7 +95,7 @@ static void WriteOutputFile(const Module *M, const ModuleSummaryIndex *Index) {
     // any non-null Index along with it as a per-module Index.
     // If both are empty, this will give an empty module block, which is
     // the expected behavior.
-    WriteBitcodeToFile(*M, Out->os(), PreserveBitcodeUseListOrder,
+    WriteBitcodeToFile(*M, Out->os(), /* ShouldPreserveUseListOrder */ true,
                        IndexToWrite, EmitModuleHash);
   else
     // Otherwise, with an empty Module but non-empty Index, we write a
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index 2b43d27..35c5409 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -80,11 +80,6 @@ static cl::opt<bool>
     cl::desc("Add informational comments to the .ll file"),
     cl::cat(DisCategory));
 
-static cl::opt<bool> PreserveAssemblyUseListOrder(
-    "preserve-ll-uselistorder",
-    cl::desc("Preserve use-list order when writing LLVM assembly."),
-    cl::init(false), cl::Hidden, cl::cat(DisCategory));
-
 static cl::opt<bool>
     MaterializeMetadata("materialize-metadata",
                         cl::desc("Load module without materializing metadata, "
@@ -255,7 +250,8 @@ int main(int argc, char **argv) {
     if (!DontPrint) {
       if (M) {
         M->removeDebugIntrinsicDeclarations();
-        M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder);
+        M->print(Out->os(), Annotator.get(),
+                 /* ShouldPreserveUseListOrder */ false);
       }
       if (Index)
         Index->print(Out->os());
diff --git a/llvm/tools/llvm-extract/llvm-extract.cpp b/llvm/tools/llvm-extract/llvm-extract.cpp
index 69636ca..439a4a4 100644
--- a/llvm/tools/llvm-extract/llvm-extract.cpp
+++ b/llvm/tools/llvm-extract/llvm-extract.cpp
@@ -129,16 +129,6 @@ static cl::opt<bool>
     OutputAssembly("S", cl::desc("Write output as LLVM assembly"), cl::Hidden,
                    cl::cat(ExtractCat));
 
-static cl::opt<bool> PreserveBitcodeUseListOrder(
-    "preserve-bc-uselistorder",
-    cl::desc("Preserve use-list order when writing LLVM bitcode."),
-    cl::init(true), cl::Hidden, cl::cat(ExtractCat));
-
-static cl::opt<bool> PreserveAssemblyUseListOrder(
-    "preserve-ll-uselistorder",
-    cl::desc("Preserve use-list order when writing LLVM assembly."),
-    cl::init(false), cl::Hidden, cl::cat(ExtractCat));
-
 int main(int argc, char **argv) {
   InitLLVM X(argc, argv);
 
@@ -421,9 +411,11 @@ int main(int argc, char **argv) {
   }
 
   if (OutputAssembly)
-    PM.addPass(PrintModulePass(Out.os(), "", PreserveAssemblyUseListOrder));
+    PM.addPass(
+        PrintModulePass(Out.os(), "", /* ShouldPreserveUseListOrder */ false));
   else if (Force || !CheckBitcodeOutputToConsole(Out.os()))
-    PM.addPass(BitcodeWriterPass(Out.os(), PreserveBitcodeUseListOrder));
+    PM.addPass(
+        BitcodeWriterPass(Out.os(), /* ShouldPreserveUseListOrder */ true));
 
   PM.run(*M, MAM);
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index 22ea54e..93b1fb6 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -110,16 +110,6 @@ static cl::opt<bool>
     SuppressWarnings("suppress-warnings",
                      cl::desc("Suppress all linking warnings"),
                      cl::init(false), cl::cat(LinkCategory));
 
-static cl::opt<bool> PreserveBitcodeUseListOrder(
-    "preserve-bc-uselistorder",
-    cl::desc("Preserve use-list order when writing LLVM bitcode."),
-    cl::init(true), cl::Hidden, cl::cat(LinkCategory));
-
-static cl::opt<bool> PreserveAssemblyUseListOrder(
-    "preserve-ll-uselistorder",
-    cl::desc("Preserve use-list order when writing LLVM assembly."),
-    cl::init(false), cl::Hidden, cl::cat(LinkCategory));
-
 static cl::opt<bool> NoVerify("disable-verify",
                               cl::desc("Do not run the verifier"), cl::Hidden,
                               cl::cat(LinkCategory));
@@ -525,9 +515,10 @@ int main(int argc, char **argv) {
     errs() << "Writing bitcode...\n";
   Composite->removeDebugIntrinsicDeclarations();
   if (OutputAssembly) {
-    Composite->print(Out.os(), nullptr, PreserveAssemblyUseListOrder);
+    Composite->print(Out.os(), nullptr, /* ShouldPreserveUseListOrder */ false);
   } else if (Force || !CheckBitcodeOutputToConsole(Out.os())) {
-    WriteBitcodeToFile(*Composite, Out.os(), PreserveBitcodeUseListOrder);
+    WriteBitcodeToFile(*Composite, Out.os(),
+                       /* ShouldPreserveUseListOrder */ true);
   }
 
   // Declare success.
diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp
index d4fa6eb..2ac8de7 100644
--- a/llvm/tools/opt/optdriver.cpp
+++ b/llvm/tools/opt/optdriver.cpp
@@ -232,16 +232,6 @@ static cl::opt<std::string> ClDataLayout("data-layout",
                                          cl::value_desc("layout-string"),
                                          cl::init(""));
 
-static cl::opt<bool> PreserveBitcodeUseListOrder(
-    "preserve-bc-uselistorder",
-    cl::desc("Preserve use-list order when writing LLVM bitcode."),
-    cl::init(true), cl::Hidden);
-
-static cl::opt<bool> PreserveAssemblyUseListOrder(
-    "preserve-ll-uselistorder",
-    cl::desc("Preserve use-list order when writing LLVM assembly."),
-    cl::init(false), cl::Hidden);
-
 static cl::opt<bool> RunTwice("run-twice",
                               cl::desc("Run all passes twice, re-using the "
                                        "same pass manager (legacy PM only)."),
@@ -753,9 +743,9 @@ extern "C" int optMain(
     return runPassPipeline(
                argv[0], *M, TM.get(), &TLII, Out.get(), ThinLinkOut.get(),
                RemarksFile.get(), Pipeline, PluginList, PassBuilderCallbacks,
-               OK, VK, PreserveAssemblyUseListOrder,
-               PreserveBitcodeUseListOrder, EmitSummaryIndex, EmitModuleHash,
-               EnableDebugify, VerifyDebugInfoPreserve,
+               OK, VK, /* ShouldPreserveAssemblyUseListOrder */ false,
+               /* ShouldPreserveBitcodeUseListOrder */ true, EmitSummaryIndex,
+               EmitModuleHash, EnableDebugify, VerifyDebugInfoPreserve,
                EnableProfileVerification, UnifiedLTO)
                ? 0
                : 1;
@@ -877,9 +867,11 @@ extern "C" int optMain(
       OS = BOS.get();
     }
     if (OutputAssembly)
-      Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder));
+      Passes.add(createPrintModulePass(
+          *OS, "", /* ShouldPreserveAssemblyUseListOrder */ false));
    else
-      Passes.add(createBitcodeWriterPass(*OS, PreserveBitcodeUseListOrder));
+      Passes.add(createBitcodeWriterPass(
+          *OS, /* ShouldPreserveBitcodeUseListOrder */ true));
   }
 
   // Before executing passes, print the final values of the LLVM options.
diff --git a/llvm/unittests/BinaryFormat/DwarfTest.cpp b/llvm/unittests/BinaryFormat/DwarfTest.cpp
index f4519f6..ba7d591 100644
--- a/llvm/unittests/BinaryFormat/DwarfTest.cpp
+++ b/llvm/unittests/BinaryFormat/DwarfTest.cpp
@@ -255,41 +255,186 @@ TEST(DwarfTest, lname_SourceLanguageNameString) {
 #include "llvm/BinaryFormat/Dwarf.def"
 }
 
-TEST(DWARFDebugInfo, TestLanguageDescription_Versioned) {
-  // Tests for the llvm::dwarf::LanguageDescription API that
-  // takes a name *and* a version.
-
-  // Unknown language.
-  EXPECT_EQ(
-      llvm::dwarf::LanguageDescription(static_cast<SourceLanguageName>(0)),
-      "Unknown");
-
-  EXPECT_EQ(
-      llvm::dwarf::LanguageDescription(static_cast<SourceLanguageName>(0), 0),
-      "Unknown");
-
-  // Test that specifying an invalid version falls back to a valid language name
-  // regardless.
-  EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_ObjC, 0), "Objective C");
-  EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_Julia, 0), "Julia");
-
-  // Check some versions.
-  EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_C_plus_plus, 199711),
-            "C++98");
-  EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_C_plus_plus, 201402),
-            "C++14");
-
-  // Versions round up.
- EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_C_plus_plus, 201400), - "C++14"); - - // Version 0 for C and C++ is an unversioned name. - EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_C, 0), "C (K&R and ISO)"); - EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_C_plus_plus, 0), - "ISO C++"); - - // Version 0 for other versioned languages may not be the unversioned name. - EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_Fortran, 0), - "FORTRAN 77"); +struct LanguageDescriptionTestCase { + llvm::dwarf::SourceLanguageName LName; + uint32_t LVersion; + llvm::StringRef ExpectedDescription; +}; + +LanguageDescriptionTestCase LanguageDescriptionTestCases[] = { + {static_cast<SourceLanguageName>(0), 0, "Unknown"}, + {static_cast<SourceLanguageName>(0), 1, "Unknown"}, + {DW_LNAME_Ada, 0, "Ada 83"}, + {DW_LNAME_Ada, 1982, "Ada 83"}, + {DW_LNAME_Ada, 1983, "Ada 83"}, + {DW_LNAME_Ada, 1994, "Ada 95"}, + {DW_LNAME_Ada, 1995, "Ada 95"}, + {DW_LNAME_Ada, 2004, "Ada 2005"}, + {DW_LNAME_Ada, 2005, "Ada 2005"}, + {DW_LNAME_Ada, 2011, "Ada 2012"}, + {DW_LNAME_Ada, 2012, "Ada 2012"}, + {DW_LNAME_Ada, 2013, "ISO Ada"}, + {DW_LNAME_Cobol, 0, "COBOL-74"}, + {DW_LNAME_Cobol, 1973, "COBOL-74"}, + {DW_LNAME_Cobol, 1974, "COBOL-74"}, + {DW_LNAME_Cobol, 1984, "COBOL-85"}, + {DW_LNAME_Cobol, 1985, "COBOL-85"}, + {DW_LNAME_Cobol, 1986, "ISO Cobol"}, + {DW_LNAME_Fortran, 0, "FORTRAN 77"}, + {DW_LNAME_Fortran, 1976, "FORTRAN 77"}, + {DW_LNAME_Fortran, 1977, "FORTRAN 77"}, + {DW_LNAME_Fortran, 1989, "FORTRAN 90"}, + {DW_LNAME_Fortran, 1990, "FORTRAN 90"}, + {DW_LNAME_Fortran, 1994, "Fortran 95"}, + {DW_LNAME_Fortran, 1995, "Fortran 95"}, + {DW_LNAME_Fortran, 2002, "Fortran 2003"}, + {DW_LNAME_Fortran, 2003, "Fortran 2003"}, + {DW_LNAME_Fortran, 2007, "Fortran 2008"}, + {DW_LNAME_Fortran, 2008, "Fortran 2008"}, + {DW_LNAME_Fortran, 2017, "Fortran 2018"}, + {DW_LNAME_Fortran, 2018, "Fortran 2018"}, + {DW_LNAME_Fortran, 2019, "ISO Fortran"}, + {DW_LNAME_C, 0, "C (K&R and ISO)"}, + {DW_LNAME_C, 198911, "C89"}, + {DW_LNAME_C, 198912, "C89"}, + {DW_LNAME_C, 199901, "C99"}, + {DW_LNAME_C, 199902, "C11"}, + {DW_LNAME_C, 201111, "C11"}, + {DW_LNAME_C, 201112, "C11"}, + {DW_LNAME_C, 201201, "C17"}, + {DW_LNAME_C, 201709, "C17"}, + {DW_LNAME_C, 201710, "C17"}, + {DW_LNAME_C, 201711, "C (K&R and ISO)"}, + {DW_LNAME_C_plus_plus, 0, "ISO C++"}, + {DW_LNAME_C_plus_plus, 199710, "C++98"}, + {DW_LNAME_C_plus_plus, 199711, "C++98"}, + {DW_LNAME_C_plus_plus, 199712, "C++03"}, + {DW_LNAME_C_plus_plus, 200310, "C++03"}, + {DW_LNAME_C_plus_plus, 200311, "C++11"}, + {DW_LNAME_C_plus_plus, 201102, "C++11"}, + {DW_LNAME_C_plus_plus, 201103, "C++11"}, + {DW_LNAME_C_plus_plus, 201104, "C++14"}, + {DW_LNAME_C_plus_plus, 201401, "C++14"}, + {DW_LNAME_C_plus_plus, 201402, "C++14"}, + {DW_LNAME_C_plus_plus, 201403, "C++17"}, + {DW_LNAME_C_plus_plus, 201702, "C++17"}, + {DW_LNAME_C_plus_plus, 201703, "C++17"}, + {DW_LNAME_C_plus_plus, 201704, "C++20"}, + {DW_LNAME_C_plus_plus, 202001, "C++20"}, + {DW_LNAME_C_plus_plus, 202002, "C++20"}, + {DW_LNAME_C_plus_plus, 202003, "ISO C++"}, + {DW_LNAME_ObjC_plus_plus, 0, LanguageDescription(DW_LNAME_ObjC_plus_plus)}, + {DW_LNAME_ObjC_plus_plus, 1, LanguageDescription(DW_LNAME_ObjC_plus_plus)}, + {DW_LNAME_ObjC, 0, LanguageDescription(DW_LNAME_ObjC)}, + {DW_LNAME_ObjC, 1, LanguageDescription(DW_LNAME_ObjC)}, + {DW_LNAME_Move, 0, LanguageDescription(DW_LNAME_Move)}, + {DW_LNAME_Move, 1, LanguageDescription(DW_LNAME_Move)}, + {DW_LNAME_SYCL, 0, LanguageDescription(DW_LNAME_SYCL)}, + 
{DW_LNAME_SYCL, 1, LanguageDescription(DW_LNAME_SYCL)}, + {DW_LNAME_BLISS, 0, LanguageDescription(DW_LNAME_BLISS)}, + {DW_LNAME_BLISS, 1, LanguageDescription(DW_LNAME_BLISS)}, + {DW_LNAME_Crystal, 0, LanguageDescription(DW_LNAME_Crystal)}, + {DW_LNAME_Crystal, 1, LanguageDescription(DW_LNAME_Crystal)}, + {DW_LNAME_D, 0, LanguageDescription(DW_LNAME_D)}, + {DW_LNAME_D, 1, LanguageDescription(DW_LNAME_D)}, + {DW_LNAME_Dylan, 0, LanguageDescription(DW_LNAME_Dylan)}, + {DW_LNAME_Dylan, 1, LanguageDescription(DW_LNAME_Dylan)}, + {DW_LNAME_Go, 0, LanguageDescription(DW_LNAME_Go)}, + {DW_LNAME_Go, 1, LanguageDescription(DW_LNAME_Go)}, + {DW_LNAME_Haskell, 0, LanguageDescription(DW_LNAME_Haskell)}, + {DW_LNAME_Haskell, 1, LanguageDescription(DW_LNAME_Haskell)}, + {DW_LNAME_HLSL, 0, LanguageDescription(DW_LNAME_HLSL)}, + {DW_LNAME_HLSL, 1, LanguageDescription(DW_LNAME_HLSL)}, + {DW_LNAME_Java, 0, LanguageDescription(DW_LNAME_Java)}, + {DW_LNAME_Java, 1, LanguageDescription(DW_LNAME_Java)}, + {DW_LNAME_Julia, 0, LanguageDescription(DW_LNAME_Julia)}, + {DW_LNAME_Julia, 1, LanguageDescription(DW_LNAME_Julia)}, + {DW_LNAME_Kotlin, 0, LanguageDescription(DW_LNAME_Kotlin)}, + {DW_LNAME_Kotlin, 1, LanguageDescription(DW_LNAME_Kotlin)}, + {DW_LNAME_Modula2, 0, LanguageDescription(DW_LNAME_Modula2)}, + {DW_LNAME_Modula2, 1, LanguageDescription(DW_LNAME_Modula2)}, + {DW_LNAME_Modula3, 0, LanguageDescription(DW_LNAME_Modula3)}, + {DW_LNAME_Modula3, 1, LanguageDescription(DW_LNAME_Modula3)}, + {DW_LNAME_OCaml, 0, LanguageDescription(DW_LNAME_OCaml)}, + {DW_LNAME_OCaml, 1, LanguageDescription(DW_LNAME_OCaml)}, + {DW_LNAME_OpenCL_C, 0, LanguageDescription(DW_LNAME_OpenCL_C)}, + {DW_LNAME_OpenCL_C, 1, LanguageDescription(DW_LNAME_OpenCL_C)}, + {DW_LNAME_Pascal, 0, LanguageDescription(DW_LNAME_Pascal)}, + {DW_LNAME_Pascal, 1, LanguageDescription(DW_LNAME_Pascal)}, + {DW_LNAME_PLI, 0, LanguageDescription(DW_LNAME_PLI)}, + {DW_LNAME_PLI, 1, LanguageDescription(DW_LNAME_PLI)}, + {DW_LNAME_Python, 0, LanguageDescription(DW_LNAME_Python)}, + {DW_LNAME_Python, 1, LanguageDescription(DW_LNAME_Python)}, + {DW_LNAME_RenderScript, 0, LanguageDescription(DW_LNAME_RenderScript)}, + {DW_LNAME_RenderScript, 1, LanguageDescription(DW_LNAME_RenderScript)}, + {DW_LNAME_Rust, 0, LanguageDescription(DW_LNAME_Rust)}, + {DW_LNAME_Rust, 1, LanguageDescription(DW_LNAME_Rust)}, + {DW_LNAME_Swift, 0, LanguageDescription(DW_LNAME_Swift)}, + {DW_LNAME_Swift, 1, LanguageDescription(DW_LNAME_Swift)}, + {DW_LNAME_UPC, 0, LanguageDescription(DW_LNAME_UPC)}, + {DW_LNAME_UPC, 1, LanguageDescription(DW_LNAME_UPC)}, + {DW_LNAME_Zig, 0, LanguageDescription(DW_LNAME_Zig)}, + {DW_LNAME_Zig, 1, LanguageDescription(DW_LNAME_Zig)}, + {DW_LNAME_Assembly, 0, LanguageDescription(DW_LNAME_Assembly)}, + {DW_LNAME_Assembly, 1, LanguageDescription(DW_LNAME_Assembly)}, + {DW_LNAME_C_sharp, 0, LanguageDescription(DW_LNAME_C_sharp)}, + {DW_LNAME_C_sharp, 1, LanguageDescription(DW_LNAME_C_sharp)}, + {DW_LNAME_Mojo, 0, LanguageDescription(DW_LNAME_Mojo)}, + {DW_LNAME_Mojo, 1, LanguageDescription(DW_LNAME_Mojo)}, + {DW_LNAME_GLSL, 0, LanguageDescription(DW_LNAME_GLSL)}, + {DW_LNAME_GLSL, 1, LanguageDescription(DW_LNAME_GLSL)}, + {DW_LNAME_GLSL_ES, 0, LanguageDescription(DW_LNAME_GLSL_ES)}, + {DW_LNAME_GLSL_ES, 1, LanguageDescription(DW_LNAME_GLSL_ES)}, + {DW_LNAME_OpenCL_CPP, 0, LanguageDescription(DW_LNAME_OpenCL_CPP)}, + {DW_LNAME_OpenCL_CPP, 1, LanguageDescription(DW_LNAME_OpenCL_CPP)}, + {DW_LNAME_CPP_for_OpenCL, 0, LanguageDescription(DW_LNAME_CPP_for_OpenCL)}, 
+    {DW_LNAME_CPP_for_OpenCL, 1, LanguageDescription(DW_LNAME_CPP_for_OpenCL)},
+    {DW_LNAME_Ruby, 0, LanguageDescription(DW_LNAME_Ruby)},
+    {DW_LNAME_Ruby, 1, LanguageDescription(DW_LNAME_Ruby)},
+    {DW_LNAME_Hylo, 0, LanguageDescription(DW_LNAME_Hylo)},
+    {DW_LNAME_Hylo, 1, LanguageDescription(DW_LNAME_Hylo)},
+    {DW_LNAME_Metal, 0, LanguageDescription(DW_LNAME_Metal)},
+    {DW_LNAME_Metal, 1, LanguageDescription(DW_LNAME_Metal)}};
+
+struct LanguageDescriptionTestFixture
+    : public testing::Test,
+      public testing::WithParamInterface<LanguageDescriptionTestCase> {};
+
+TEST_P(LanguageDescriptionTestFixture, TestLanguageDescription) {
+  auto [LName, LVersion, ExpectedDescription] = GetParam();
+
+  // Basic test.
+  EXPECT_EQ(llvm::dwarf::LanguageDescription(LName, LVersion),
+            ExpectedDescription);
+
+  // Now do the same test but roundtrip through the DW_LANG_ <-> DW_LNAME_
+  // conversion APIs first.
+
+  auto DWLang = llvm::dwarf::toDW_LANG(LName, LVersion);
+  // Some languages are not mapped 1-to-1; in that case there's nothing else
+  // to test.
+  if (!DWLang)
+    return;
+
+  std::optional<std::pair<SourceLanguageName, uint32_t>> DWLName =
+      llvm::dwarf::toDW_LNAME(*DWLang);
+
+  // We are roundtripping, so there definitely should be a mapping back to
+  // DW_LNAME_.
+  ASSERT_TRUE(DWLName);
+
+  // There is no official DW_LANG_ code for C++98, so the round trip turns it
+  // into a plain DW_LANG_C_plus_plus.
+  if (DWLang == DW_LANG_C_plus_plus && LVersion <= 199711)
+    EXPECT_EQ(llvm::dwarf::LanguageDescription(DWLName->first, DWLName->second),
+              "ISO C++");
+  else
+    EXPECT_EQ(llvm::dwarf::LanguageDescription(DWLName->first, DWLName->second),
+              ExpectedDescription);
 }
+
+INSTANTIATE_TEST_SUITE_P(LanguageDescriptionTests,
+                         LanguageDescriptionTestFixture,
+                         ::testing::ValuesIn(LanguageDescriptionTestCases));
+
 } // end namespace
diff --git a/llvm/unittests/IR/ConstantFPRangeTest.cpp b/llvm/unittests/IR/ConstantFPRangeTest.cpp
index 1436f0f..58a65b9 100644
--- a/llvm/unittests/IR/ConstantFPRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantFPRangeTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/ConstantFPRange.h"
+#include "llvm/ADT/APFloat.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Operator.h"
 #include "gtest/gtest.h"
@@ -802,4 +803,126 @@ TEST_F(ConstantFPRangeTest, negate) {
            ConstantFPRange::getNonNaN(APFloat(-2.0), APFloat(3.0)));
 }
 
+TEST_F(ConstantFPRangeTest, getWithout) {
+  EXPECT_EQ(Full.getWithoutNaN(), ConstantFPRange::getNonNaN(Sem));
+  EXPECT_EQ(NaN.getWithoutNaN(), Empty);
+
+  EXPECT_EQ(NaN.getWithoutInf(), NaN);
+  EXPECT_EQ(PosInf.getWithoutInf(), Empty);
+  EXPECT_EQ(NegInf.getWithoutInf(), Empty);
+  EXPECT_EQ(ConstantFPRange::getNonNaN(Sem).getWithoutInf(), Finite);
+  EXPECT_EQ(Zero.getWithoutInf(), Zero);
+  EXPECT_EQ(ConstantFPRange::getNonNaN(APFloat::getInf(Sem, /*Negative=*/true),
+                                       APFloat(3.0))
+                .getWithoutInf(),
+            ConstantFPRange::getNonNaN(
+                APFloat::getLargest(Sem, /*Negative=*/true), APFloat(3.0)));
+}
+
+TEST_F(ConstantFPRangeTest, cast) {
+  const fltSemantics &F16Sem = APFloat::IEEEhalf();
+  const fltSemantics &BF16Sem = APFloat::BFloat();
+  const fltSemantics &F32Sem = APFloat::IEEEsingle();
+  const fltSemantics &F8NanOnlySem = APFloat::Float8E4M3FN();
+  // normal -> normal (exact)
+  EXPECT_EQ(ConstantFPRange::getNonNaN(APFloat(1.0), APFloat(2.0)).cast(F32Sem),
+            ConstantFPRange::getNonNaN(APFloat(1.0f), APFloat(2.0f)));
+  EXPECT_EQ(
+      ConstantFPRange::getNonNaN(APFloat(-2.0f),
diff --git a/llvm/unittests/IR/ConstantFPRangeTest.cpp b/llvm/unittests/IR/ConstantFPRangeTest.cpp
index 1436f0f..58a65b9 100644
--- a/llvm/unittests/IR/ConstantFPRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantFPRangeTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/ConstantFPRange.h"
+#include "llvm/ADT/APFloat.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Operator.h"
 #include "gtest/gtest.h"
@@ -802,4 +803,126 @@ TEST_F(ConstantFPRangeTest, negate) {
             ConstantFPRange::getNonNaN(APFloat(-2.0), APFloat(3.0)));
 }
 
+TEST_F(ConstantFPRangeTest, getWithout) {
+  EXPECT_EQ(Full.getWithoutNaN(), ConstantFPRange::getNonNaN(Sem));
+  EXPECT_EQ(NaN.getWithoutNaN(), Empty);
+
+  EXPECT_EQ(NaN.getWithoutInf(), NaN);
+  EXPECT_EQ(PosInf.getWithoutInf(), Empty);
+  EXPECT_EQ(NegInf.getWithoutInf(), Empty);
+  EXPECT_EQ(ConstantFPRange::getNonNaN(Sem).getWithoutInf(), Finite);
+  EXPECT_EQ(Zero.getWithoutInf(), Zero);
+  EXPECT_EQ(ConstantFPRange::getNonNaN(APFloat::getInf(Sem, /*Negative=*/true),
+                                       APFloat(3.0))
+                .getWithoutInf(),
+            ConstantFPRange::getNonNaN(
+                APFloat::getLargest(Sem, /*Negative=*/true), APFloat(3.0)));
+}
+
+TEST_F(ConstantFPRangeTest, cast) {
+  const fltSemantics &F16Sem = APFloat::IEEEhalf();
+  const fltSemantics &BF16Sem = APFloat::BFloat();
+  const fltSemantics &F32Sem = APFloat::IEEEsingle();
+  const fltSemantics &F8NanOnlySem = APFloat::Float8E4M3FN();
+  // normal -> normal (exact)
+  EXPECT_EQ(ConstantFPRange::getNonNaN(APFloat(1.0), APFloat(2.0)).cast(F32Sem),
+            ConstantFPRange::getNonNaN(APFloat(1.0f), APFloat(2.0f)));
+  EXPECT_EQ(
+      ConstantFPRange::getNonNaN(APFloat(-2.0f), APFloat(-1.0f)).cast(Sem),
+      ConstantFPRange::getNonNaN(APFloat(-2.0), APFloat(-1.0)));
+  // normal -> normal (inexact)
+  EXPECT_EQ(
+      ConstantFPRange::getNonNaN(APFloat(3.141592653589793),
+                                 APFloat(6.283185307179586))
+          .cast(F32Sem),
+      ConstantFPRange::getNonNaN(APFloat(3.14159274f), APFloat(6.28318548f)));
+  // normal -> subnormal
+  EXPECT_EQ(ConstantFPRange::getNonNaN(APFloat(-5e-8), APFloat(5e-8))
+                .cast(F16Sem)
+                .classify(),
+            fcSubnormal | fcZero);
+  // normal -> zero
+  EXPECT_EQ(ConstantFPRange::getNonNaN(
+                APFloat::getSmallestNormalized(Sem, /*Negative=*/true),
+                APFloat::getSmallestNormalized(Sem, /*Negative=*/false))
+                .cast(F32Sem)
+                .classify(),
+            fcZero);
+  // normal -> inf
+  EXPECT_EQ(ConstantFPRange::getNonNaN(APFloat(-65536.0), APFloat(65536.0))
+                .cast(F16Sem),
+            ConstantFPRange::getNonNaN(F16Sem));
+  // nan -> qnan
+  EXPECT_EQ(
+      ConstantFPRange::getNaNOnly(Sem, /*MayBeQNaN=*/true, /*MayBeSNaN=*/false)
+          .cast(F32Sem),
+      ConstantFPRange::getNaNOnly(F32Sem, /*MayBeQNaN=*/true,
+                                  /*MayBeSNaN=*/false));
+  EXPECT_EQ(
+      ConstantFPRange::getNaNOnly(Sem, /*MayBeQNaN=*/false, /*MayBeSNaN=*/true)
+          .cast(F32Sem),
+      ConstantFPRange::getNaNOnly(F32Sem, /*MayBeQNaN=*/true,
+                                  /*MayBeSNaN=*/false));
+  EXPECT_EQ(
+      ConstantFPRange::getNaNOnly(Sem, /*MayBeQNaN=*/true, /*MayBeSNaN=*/true)
+          .cast(F32Sem),
+      ConstantFPRange::getNaNOnly(F32Sem, /*MayBeQNaN=*/true,
+                                  /*MayBeSNaN=*/false));
+  // For BF16 -> F32, signaling bit is still lost.
+  EXPECT_EQ(ConstantFPRange::getNaNOnly(BF16Sem, /*MayBeQNaN=*/true,
+                                        /*MayBeSNaN=*/true)
+                .cast(F32Sem),
+            ConstantFPRange::getNaNOnly(F32Sem, /*MayBeQNaN=*/true,
+                                        /*MayBeSNaN=*/false));
+  // inf -> nan only (return full set for now)
+  EXPECT_EQ(ConstantFPRange::getNonNaN(APFloat::getInf(Sem, /*Negative=*/true),
+                                       APFloat::getInf(Sem, /*Negative=*/false))
+                .cast(F8NanOnlySem),
+            ConstantFPRange::getFull(F8NanOnlySem));
+  // other rounding modes
+  EXPECT_EQ(
+      ConstantFPRange::getNonNaN(APFloat::getSmallest(Sem, /*Negative=*/true),
+                                 APFloat::getSmallest(Sem, /*Negative=*/false))
+          .cast(F32Sem, APFloat::rmTowardNegative),
+      ConstantFPRange::getNonNaN(
+          APFloat::getSmallest(F32Sem, /*Negative=*/true),
+          APFloat::getZero(F32Sem, /*Negative=*/false)));
+  EXPECT_EQ(
+      ConstantFPRange::getNonNaN(APFloat::getSmallest(Sem, /*Negative=*/true),
+                                 APFloat::getSmallest(Sem, /*Negative=*/false))
+          .cast(F32Sem, APFloat::rmTowardPositive),
+      ConstantFPRange::getNonNaN(
+          APFloat::getZero(F32Sem, /*Negative=*/true),
+          APFloat::getSmallest(F32Sem, /*Negative=*/false)));
+  EXPECT_EQ(
+      ConstantFPRange::getNonNaN(
+          APFloat::getSmallestNormalized(Sem, /*Negative=*/true),
+          APFloat::getSmallestNormalized(Sem, /*Negative=*/false))
+          .cast(F32Sem, APFloat::rmTowardZero),
+      ConstantFPRange::getNonNaN(APFloat::getZero(F32Sem, /*Negative=*/true),
+                                 APFloat::getZero(F32Sem, /*Negative=*/false)));
+
+  EnumerateValuesInConstantFPRange(
+      ConstantFPRange::getFull(APFloat::Float8E4M3()),
+      [&](const APFloat &V) {
+        bool LosesInfo = false;
+
+        APFloat DoubleV = V;
+        DoubleV.convert(Sem, APFloat::rmNearestTiesToEven, &LosesInfo);
+        ConstantFPRange DoubleCR = ConstantFPRange(V).cast(Sem);
+        EXPECT_TRUE(DoubleCR.contains(DoubleV))
+            << "Casting " << V << " to double failed. " << DoubleCR
+            << " doesn't contain " << DoubleV;
+
+        auto &FP4Sem = APFloat::Float4E2M1FN();
+        APFloat FP4V = V;
+        FP4V.convert(FP4Sem, APFloat::rmNearestTiesToEven, &LosesInfo);
+        ConstantFPRange FP4CR = ConstantFPRange(V).cast(FP4Sem);
+        EXPECT_TRUE(FP4CR.contains(FP4V))
+            << "Casting " << V << " to FP4E2M1FN failed. " << FP4CR
+            << " doesn't contain " << FP4V;
+      },
+      /*IgnoreNaNPayload=*/true);
+}
+
 } // anonymous namespace
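(Aside: the enumeration at the end of the cast test pins down a soundness property: casting a range must over-approximate element-wise conversion. A minimal sketch of that property for a single value, illustrative only and not part of the patch, using only the ConstantFPRange and APFloat calls the test itself makes:)

  #include "llvm/ADT/APFloat.h"
  #include "llvm/IR/ConstantFPRange.h"

  // Returns true if casting the singleton range {V} to DstSem yields a range
  // that contains the element-wise conversion of V, i.e. cast() soundly
  // over-approximates APFloat::convert for this value.
  static bool castIsSoundFor(const llvm::APFloat &V,
                             const llvm::fltSemantics &DstSem) {
    bool LosesInfo = false;
    llvm::APFloat Converted = V;
    Converted.convert(DstSem, llvm::APFloat::rmNearestTiesToEven, &LosesInfo);
    return llvm::ConstantFPRange(V).cast(DstSem).contains(Converted);
  }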
" << DoubleCR + << " doesn't contain " << DoubleV; + + auto &FP4Sem = APFloat::Float4E2M1FN(); + APFloat FP4V = V; + FP4V.convert(FP4Sem, APFloat::rmNearestTiesToEven, &LosesInfo); + ConstantFPRange FP4CR = ConstantFPRange(V).cast(FP4Sem); + EXPECT_TRUE(FP4CR.contains(FP4V)) + << "Casting " << V << " to FP4E2M1FN failed. " << FP4CR + << " doesn't contain " << FP4V; + }, + /*IgnoreNaNPayload=*/true); +} + } // anonymous namespace diff --git a/llvm/unittests/Support/raw_ostream_test.cpp b/llvm/unittests/Support/raw_ostream_test.cpp index fbeff37..8f9ed41 100644 --- a/llvm/unittests/Support/raw_ostream_test.cpp +++ b/llvm/unittests/Support/raw_ostream_test.cpp @@ -626,6 +626,11 @@ TEST(raw_ostreamTest, writeToDevNull) { EXPECT_TRUE(DevNullIsUsed); } +TEST(raw_ostreamTest, nullStreamZeroBufferSize) { + raw_ostream &NullStream = nulls(); + EXPECT_EQ(NullStream.GetBufferSize(), 0u); +} + TEST(raw_ostreamTest, writeToStdOut) { outs().flush(); testing::internal::CaptureStdout(); diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn index eb8aef2..5f9eb9a 100644 --- a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn @@ -27,6 +27,8 @@ static_library("Analysis") { "FixitUtil.cpp", "IntervalPartition.cpp", "IssueHash.cpp", + "LifetimeAnnotations.cpp", + "LifetimeSafety.cpp", "LiveVariables.cpp", "MacroExpansionContext.cpp", "ObjCNoReturn.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn deleted file mode 100644 index a148e78..0000000 --- a/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn +++ /dev/null @@ -1,16 +0,0 @@ -static_library("LifetimeSafety") { - output_name = "clangAnalysisLifetimeSafety" - configs += [ "//llvm/utils/gn/build:clang_code" ] - deps = [ "//clang/lib/Basic" ] - sources = [ - "Checker.cpp", - "Facts.cpp", - "FactsGenerator.cpp", - "LifetimeAnnotations.cpp", - "LifetimeSafety.cpp", - "LiveOrigins.cpp", - "Loans.cpp", - "LoanPropagation.cpp", - "Origins.cpp", - ] -} diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn index 14f44c4..96ff481 100644 --- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn @@ -29,7 +29,6 @@ static_library("Sema") { "//clang/lib/APINotes", "//clang/lib/AST", "//clang/lib/Analysis", - "//clang/lib/Analysis/LifetimeSafety", "//clang/lib/Basic", "//clang/lib/Edit", "//clang/lib/Lex", diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 8c41466..39ff476 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -10,7 +10,7 @@ CodeGen/AArch64/llvm-masked-gather-legal-for-sve.ll CodeGen/AArch64/llvm-masked-scatter-legal-for-sve.ll CodeGen/AArch64/selectopt-cast.ll CodeGen/AArch64/selectopt.ll -CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll +CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll @@ -75,6 +75,7 @@ CodeGen/NVPTX/lower-ctor-dtor.ll CodeGen/RISCV/zmmul.ll CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll CodeGen/WebAssembly/memory-interleave.ll +CodeGen/X86/global-variable-partition-with-dap.ll CodeGen/X86/masked_gather_scatter.ll CodeGen/X86/nocfivalue.ll 
diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
index eb8aef2..5f9eb9a 100644
--- a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
@@ -27,6 +27,8 @@ static_library("Analysis") {
     "FixitUtil.cpp",
     "IntervalPartition.cpp",
     "IssueHash.cpp",
+    "LifetimeAnnotations.cpp",
+    "LifetimeSafety.cpp",
     "LiveVariables.cpp",
     "MacroExpansionContext.cpp",
     "ObjCNoReturn.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn
deleted file mode 100644
index a148e78..0000000
--- a/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn
+++ /dev/null
@@ -1,16 +0,0 @@
-static_library("LifetimeSafety") {
-  output_name = "clangAnalysisLifetimeSafety"
-  configs += [ "//llvm/utils/gn/build:clang_code" ]
-  deps = [ "//clang/lib/Basic" ]
-  sources = [
-    "Checker.cpp",
-    "Facts.cpp",
-    "FactsGenerator.cpp",
-    "LifetimeAnnotations.cpp",
-    "LifetimeSafety.cpp",
-    "LiveOrigins.cpp",
-    "Loans.cpp",
-    "LoanPropagation.cpp",
-    "Origins.cpp",
-  ]
-}
diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
index 14f44c4..96ff481 100644
--- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
@@ -29,7 +29,6 @@ static_library("Sema") {
     "//clang/lib/APINotes",
     "//clang/lib/AST",
     "//clang/lib/Analysis",
-    "//clang/lib/Analysis/LifetimeSafety",
     "//clang/lib/Basic",
     "//clang/lib/Edit",
     "//clang/lib/Lex",
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index 8c41466..39ff476 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -10,7 +10,7 @@ CodeGen/AArch64/llvm-masked-gather-legal-for-sve.ll
 CodeGen/AArch64/llvm-masked-scatter-legal-for-sve.ll
 CodeGen/AArch64/selectopt-cast.ll
 CodeGen/AArch64/selectopt.ll
-CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll
 CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
 CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
 CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll
@@ -75,6 +75,7 @@ CodeGen/NVPTX/lower-ctor-dtor.ll
 CodeGen/RISCV/zmmul.ll
 CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll
 CodeGen/WebAssembly/memory-interleave.ll
+CodeGen/X86/global-variable-partition-with-dap.ll
 CodeGen/X86/masked_gather_scatter.ll
 CodeGen/X86/nocfivalue.ll
 DebugInfo/AArch64/ir-outliner.ll
@@ -84,7 +85,6 @@ DebugInfo/KeyInstructions/Generic/loop-unswitch.ll
 DebugInfo/X86/asan_debug_info.ll
 Instrumentation/AddressSanitizer/aarch64be.ll
 Instrumentation/AddressSanitizer/adaptive_global_redzones.ll
-Instrumentation/AddressSanitizer/alloca-offset-lifetime.ll
 Instrumentation/AddressSanitizer/AMDGPU/adaptive_constant_global_redzones.ll
 Instrumentation/AddressSanitizer/AMDGPU/adaptive_global_redzones.ll
 Instrumentation/AddressSanitizer/AMDGPU/asan_do_not_instrument_lds.ll
@@ -548,12 +548,6 @@ tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test
 tools/UpdateTestChecks/update_test_checks/stable_ir_values.test
 tools/UpdateTestChecks/update_test_checks/tbaa-semantics-checks.test
 tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test
-Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
-Transforms/AggressiveInstCombine/lower-table-based-cttz-dereferencing-pointer.ll
-Transforms/AggressiveInstCombine/lower-table-based-cttz-non-argument-value.ll
-Transforms/AggressiveInstCombine/lower-table-based-cttz-zero-element.ll
-Transforms/AggressiveInstCombine/trunc_select_cmp.ll
-Transforms/AggressiveInstCombine/trunc_select.ll
 Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
 Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
 Transforms/AtomicExpand/AArch64/pcsections.ll
@@ -818,7 +812,6 @@ Transforms/InstCombine/AMDGPU/addrspacecast.ll
 Transforms/InstCombine/and2.ll
 Transforms/InstCombine/and-fcmp.ll
 Transforms/InstCombine/and.ll
-Transforms/InstCombine/and-or-icmp-nullptr.ll
 Transforms/InstCombine/and-or-icmps.ll
 Transforms/InstCombine/and-or-implied-cond-not.ll
 Transforms/InstCombine/apint-div1.ll
@@ -1259,7 +1252,6 @@ Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
 Transforms/PhaseOrdering/AArch64/predicated-reduction.ll
 Transforms/PhaseOrdering/AArch64/quant_4x4.ll
 Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
-Transforms/PhaseOrdering/lower-table-based-cttz.ll
 Transforms/PhaseOrdering/vector-select.ll
 Transforms/PhaseOrdering/X86/blendv-select.ll
 Transforms/PhaseOrdering/X86/merge-functions2.ll